From d100af9d0db9f710a81b80988056e1a47fbb141e Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Sat, 11 Jul 2015 01:58:10 +0800 Subject: [PATCH 001/124] This is yibing's first patch. removed all cuda files and added device file --- Makefile | 16 +- include/caffe/common.hpp | 25 +- include/caffe/device.hpp | 47 ++ include/caffe/util/math_functions.hpp | 280 +++++++----- include/caffe/util/math_functions.hpp.protect | 280 ++++++++++++ src/caffe/common.cpp | 22 +- src/caffe/device.cpp | 421 +++++++++++++++++ src/caffe/layers/conv_layer.cpp | 10 + .../layers/{ => cufiles}/absval_layer.cu | 0 .../layers/{ => cufiles}/base_data_layer.cu | 0 src/caffe/layers/{ => cufiles}/bnll_layer.cu | 0 .../layers/{ => cufiles}/concat_layer.cu | 0 .../{ => cufiles}/contrastive_loss_layer.cu | 0 src/caffe/layers/{ => cufiles}/conv_layer.cu | 0 .../layers/{ => cufiles}/cudnn_conv_layer.cu | 0 .../{ => cufiles}/cudnn_pooling_layer.cu | 0 .../layers/{ => cufiles}/cudnn_relu_layer.cu | 0 .../{ => cufiles}/cudnn_sigmoid_layer.cu | 0 .../{ => cufiles}/cudnn_softmax_layer.cu | 0 .../layers/{ => cufiles}/cudnn_tanh_layer.cu | 0 .../layers/{ => cufiles}/deconv_layer.cu | 0 .../layers/{ => cufiles}/dropout_layer.cu | 0 .../layers/{ => cufiles}/eltwise_layer.cu | 0 .../{ => cufiles}/euclidean_loss_layer.cu | 0 src/caffe/layers/{ => cufiles}/exp_layer.cu | 0 .../layers/{ => cufiles}/filter_layer.cu | 0 .../layers/{ => cufiles}/hdf5_data_layer.cu | 0 .../layers/{ => cufiles}/hdf5_output_layer.cu | 0 .../layers/{ => cufiles}/im2col_layer.cu | 0 .../{ => cufiles}/inner_product_layer.cu | 0 src/caffe/layers/{ => cufiles}/log_layer.cu | 0 src/caffe/layers/{ => cufiles}/lrn_layer.cu | 0 src/caffe/layers/{ => cufiles}/mvn_layer.cu | 0 .../layers/{ => cufiles}/pooling_layer.cu | 0 src/caffe/layers/{ => cufiles}/power_layer.cu | 0 src/caffe/layers/{ => cufiles}/prelu_layer.cu | 0 .../layers/{ => cufiles}/reduction_layer.cu | 0 src/caffe/layers/{ => cufiles}/relu_layer.cu | 0 
.../sigmoid_cross_entropy_loss_layer.cu | 0 .../layers/{ => cufiles}/sigmoid_layer.cu | 0 .../layers/{ => cufiles}/silence_layer.cu | 0 src/caffe/layers/{ => cufiles}/slice_layer.cu | 0 .../layers/{ => cufiles}/softmax_layer.cu | 0 .../{ => cufiles}/softmax_loss_layer.cu | 0 src/caffe/layers/{ => cufiles}/split_layer.cu | 0 src/caffe/layers/{ => cufiles}/tanh_layer.cu | 0 .../layers/{ => cufiles}/threshold_layer.cu | 0 src/caffe/layers/dropout_layer.cpp | 10 + src/caffe/layers/pooling_layer.cpp | 9 + src/caffe/layers/relu_layer.cpp | 10 + src/caffe/syncedmem.cpp | 11 +- src/caffe/util/math_functions.cpp | 430 ++++++++++++++---- src/caffe/util/math_functions.cpp.protect | 413 +++++++++++++++++ 53 files changed, 1771 insertions(+), 213 deletions(-) create mode 100644 include/caffe/device.hpp create mode 100644 include/caffe/util/math_functions.hpp.protect create mode 100644 src/caffe/device.cpp rename src/caffe/layers/{ => cufiles}/absval_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/base_data_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/bnll_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/concat_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/contrastive_loss_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/conv_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_conv_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_pooling_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_relu_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_sigmoid_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_softmax_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/cudnn_tanh_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/deconv_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/dropout_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/eltwise_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/euclidean_loss_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/exp_layer.cu (100%) rename 
src/caffe/layers/{ => cufiles}/filter_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/hdf5_data_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/hdf5_output_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/im2col_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/inner_product_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/log_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/lrn_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/mvn_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/pooling_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/power_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/prelu_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/reduction_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/relu_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/sigmoid_cross_entropy_loss_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/sigmoid_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/silence_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/slice_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/softmax_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/softmax_loss_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/split_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/tanh_layer.cu (100%) rename src/caffe/layers/{ => cufiles}/threshold_layer.cu (100%) create mode 100644 src/caffe/util/math_functions.cpp.protect diff --git a/Makefile b/Makefile index 05b783af..80c5642d 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so # CXX_SRCS are the source files excluding the test ones. CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp") # CU_SRCS are the cuda source files -CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu") +#CU_SRCS := $(shell find src/$(PROJECT) ! 
-name "test_*.cu" -name "*.cu") # TEST_SRCS are the test source files TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp") @@ -525,13 +525,13 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) -$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS) - @ echo NVCC $< - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ - -odir $(@D) - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \ - || (cat $@.$(WARNS_EXT); exit 1) - @ cat $@.$(WARNS_EXT) +#$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS) +# @ echo NVCC $< +# $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ +# -odir $(@D) +# $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \ +# || (cat $@.$(WARNS_EXT); exit 1) +# @ cat $@.$(WARNS_EXT) $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ | $(DYNAMIC_NAME) $(TEST_BIN_DIR) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 5f86bc26..b1528474 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -1,6 +1,7 @@ #ifndef CAFFE_COMMON_HPP_ #define CAFFE_COMMON_HPP_ +#include #include #include #include @@ -15,7 +16,10 @@ #include #include // pair #include +#include +#include +#include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" // gflags 2.1 issue: namespace google was changed to gflags without warning. @@ -65,6 +69,25 @@ private:\ // is executed we will see a fatal log. #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" +// OpenCL: various checks for different function calls. 
+#define OCL_CHECK(condition) \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << error; \ + if(CL_SUCCESS != error){ \ + LOG(INFO) << "failed";\ + } \ + } while (0) + +#define CLBLAS_CHECK(flag) \ + do { \ + cl_int error = flag; \ + CHECK_EQ(error, clblasSuccess) << " " << error; \ + if (error != clblasSuccess){ \ + LOG(INFO) << "clBlas Function Failed! Error Code:" << error; \ + } \ + } while(0) + // See PR #1236 namespace cv { class Mat; } @@ -104,7 +127,7 @@ class Caffe { } return *singleton_; } - enum Brew { CPU, GPU }; + enum Brew { CPU, GPU, APU }; // This random number generator facade hides boost and CUDA rng // implementation from one another (for cross-platform compatibility). diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp new file mode 100644 index 00000000..07e65848 --- /dev/null +++ b/include/caffe/device.hpp @@ -0,0 +1,47 @@ +#ifndef CAFFE_DEVICE_HPP +#define CAFFE_DEVICE_HPP +#include +#include +#include +#include "caffe/common.hpp" +namespace caffe { + +class Device{ +public: + Device():numPlatforms(0),numDevices(0){} + ~Device(); + cl_uint numPlatforms; + cl_platform_id * platformIDs; + char platformName[64]; + char openclVersion[64]; + cl_uint numDevices; + cl_device_id * DeviceIDs; + cl_context Context; + cl_command_queue CommandQueue; + cl_command_queue CommandQueue_helper; + cl_program Program; + clblasOrder col; + clblasOrder row; + + + cl_int Init(); + cl_int ConvertToString(const char *pFileName,std::string &Str); + void DisplayPlatformInfo(); + void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + + void GetDeviceInfo(); + + template + void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str); + template + void appendBitfield(T info, T value, std::string name, std::string &str); + + +}; +extern char* buildOption; +extern Device amdDevice; + +} // namespace caffe + +#endif //CAFFE_DEVICE_HPP + diff --git a/include/caffe/util/math_functions.hpp 
b/include/caffe/util/math_functions.hpp index 2cacd8e7..bcafeb89 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -1,18 +1,19 @@ +// Copyright 2014 BVLC and contributors. + #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_MATH_FUNCTIONS_H_ #include #include // for std::fabs and std::signbit - +#include +#include #include "glog/logging.h" -#include "caffe/common.hpp" -#include "caffe/util/device_alternate.hpp" #include "caffe/util/mkl_alternate.hpp" namespace caffe { -// Caffe gemm provides a simpler interface to the gemm functions, with the +// Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, @@ -20,35 +21,97 @@ void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. 
+template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +cl_event caffe_gpu_gemmex( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, + Dtype* C, const int offC); +/*This is Yuan Gao's sgemm_ex*/ +template +void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C, const int offset1, const int offset2, const int offset3); + + +template +cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, + Dtype* C, const int offC); + template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, Dtype* y); +template +void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, + const Dtype * x, size_t offx, const Dtype beta, int incx, + Dtype* y, size_t offy, int incy); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + + template void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y); +template +void 
caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + template void caffe_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_set(const int N, const Dtype alpha, Dtype *X); -inline void caffe_memset(const size_t N, const int alpha, void* X) { - memset(X, alpha, N); // NOLINT(caffe/alt_fn) -} +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); + template void caffe_scal(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); + template void caffe_sqr(const int N, const Dtype* a, Dtype* y); @@ -61,12 +124,35 @@ void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +//template +//void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, Dtype* scratch_buf); +//CUDA version, need to be deleted +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_mul(cl_kernel Kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y); + template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); +//CUDA version, need to be deleted +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y); + template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); +//CUDA version, need to be deleted 
+template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +template +void caffe_gpu_powx(cl_kernel Kernel, const int n, const Dtype* a, const Dtype b, Dtype* y); + unsigned int caffe_rng_rand(); template @@ -75,43 +161,62 @@ Dtype caffe_nextafter(const Dtype b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. +void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. +template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, Dtype* r); template -void caffe_rng_bernoulli(const int n, const Dtype p, int* r); +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); template -void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); +void caffe_rng_bernoulli(const int n, const Dtype p, int* r); template -void caffe_exp(const int n, const Dtype* a, Dtype* y); +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); template -void caffe_log(const int n, const Dtype* a, Dtype* y); +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); template -void caffe_abs(const int n, const Dtype* a, Dtype* y); +void caffe_exp(const int n, const Dtype* a, Dtype* y); template Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); template -Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); +void caffe_gpu_dot(const int n, 
const Dtype* x, const Dtype* y, Dtype* out); template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + // Returns the sum of the absolute values of the elements of vector x template Dtype caffe_cpu_asum(const int n, const Dtype* x); +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c template -inline int8_t caffe_sign(Dtype val) { +inline signed char caffe_sign(Dtype val) { /* explicitly signed: plain char is unsigned on some ABIs, which would turn the -1 result into 255 */ return (Dtype(0) < val) - (val < Dtype(0)); } @@ -130,63 +235,57 @@ inline int8_t caffe_sign(Dtype val) { } \ } +#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \ + template <> \ + void caffe_cpu_##name(const int n, const float* x, float* y); \ + template <> \ + void caffe_cpu_##name(const int n, const double* x, double* y) + +/* +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + CUDA_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ + NOLINT_NEXT_LINE(whitespace/operators) \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ + NOLINT_NEXT_LINE(whitespace/operators) \ + name##_kernel<<>>( \ + n, x, y); \ +} +*/ // output is 1 for the positives, 0 for zero, and -1 for the negatives DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); -// This returns a nonzero value if the input has its sign bit set. -// The name sngbit is meant to avoid conflicts with std::signbit in the macro. -// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, -// and we don't want that to expand here when CUDA headers are also included.
-DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); - -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); - -template -void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); - -#ifndef CPU_ONLY // GPU - -// Decaf gpu gemm provides an interface that is almost the same as the cpu -// gemm function - following the c convention and calling the fortran-order -// gpu code under the hood. -template -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); - -template -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); +template +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); -template -void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +// This returns a nonzero value if the input has its sign bit set. 
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro +using std::signbit; +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); -template -void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); +template +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); - -inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) -#else - NO_GPU; -#endif -} +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); template -void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -212,69 +311,20 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -// caffe_gpu_rng_uniform with two arguments generates integers in the range -// [0, UINT_MAX]. -void caffe_gpu_rng_uniform(const int n, unsigned int* r); - -// caffe_gpu_rng_uniform with four arguments generates floats in the range -// (a, b] (strictly greater than a, less than or equal to b) due to the -// specification of curandGenerateUniform. With a = 0, b = 1, just calls -// curandGenerateUniform; with other limits will shift and scale the outputs -// appropriately after calling curandGenerateUniform. 
-template -void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); - template -void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); - -template -void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); - -template -void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); +void caffe_exp(const int n, const Dtype* a, Dtype* y); -template -uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); template -void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); +void caffe_abs(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); +void caffe_log(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); - -#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ -template \ -__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - CUDA_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} - -#endif // !CPU_ONLY - +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); } // namespace caffe + #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/math_functions.hpp.protect b/include/caffe/util/math_functions.hpp.protect new file mode 100644 index 00000000..2cacd8e7 --- /dev/null +++ b/include/caffe/util/math_functions.hpp.protect @@ -0,0 +1,280 @@ 
+#ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_MATH_FUNCTIONS_H_ + +#include +#include // for std::fabs and std::signbit + +#include "glog/logging.h" + +#include "caffe/common.hpp" +#include "caffe/util/device_alternate.hpp" +#include "caffe/util/mkl_alternate.hpp" + +namespace caffe { + +// Caffe gemm provides a simpler interface to the gemm functions, with the +// limitation that the data has to be contiguous in memory. +template +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +template +void caffe_copy(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_memset(const size_t N, const int alpha, void* X) { + memset(X, alpha, N); // NOLINT(caffe/alt_fn) +} + +template +void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_sqr(const int N, const Dtype* a, Dtype* y); + +template +void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +unsigned int caffe_rng_rand(); + +template +Dtype 
caffe_nextafter(const Dtype b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); + +template +void caffe_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_abs(const int n, const Dtype* a, Dtype* y); + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); + +template +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); + +template +int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_cpu_asum(const int n, const Dtype* x); + +// the branchless, type-safe version from +// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c +template +inline int8_t caffe_sign(Dtype val) { + return (Dtype(0) < val) - (val < Dtype(0)); +} + +// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC +// in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp. +// Please refer to commit 7e8ef25c7 of the boost-eigen branch. +// Git cherry picking that commit caused a conflict hard to resolve and +// copying that file in convenient for code reviewing. +// So they have to be pasted here temporarily. 
+#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ + template \ + void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(x); CHECK(y); \ + for (int i = 0; i < n; ++i) { \ + operation; \ + } \ + } + +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); + +// This returns a nonzero value if the input has its sign bit set. +// The name sngbit is meant to avoid conflicts with std::signbit in the macro. +// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, +// and we don't want that to expand here when CUDA headers are also included. +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ + y[i] = static_cast((std::signbit)(x[i]))); + +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); + +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#ifndef CPU_ONLY // GPU + +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. 
+template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) +#else + NO_GPU; +#endif +} + +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. 
+void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. +template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); + +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + CUDA_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} + +#endif // !CPU_ONLY + +} // namespace caffe + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index af96cac4..e53a5c0d 100644 
--- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -85,7 +85,8 @@ void* Caffe::RNG::generator() { #else // Normal GPU + CPU Caffe. Caffe::Caffe() - : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), +{ +/* : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), mode_(Caffe::CPU) { // Try to create a cublas handler, and report an error if failed (but we will // keep the program running as one might just want to run CPU code). @@ -99,18 +100,20 @@ Caffe::Caffe() != CURAND_STATUS_SUCCESS) { LOG(ERROR) << "Cannot create Curand generator. Curand won't be available."; } +*/ } Caffe::~Caffe() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + /* if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); if (curand_generator_) { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); } +*/ } void Caffe::set_random_seed(const unsigned int seed) { // Curand seed - static bool g_curand_availability_logged = false; + /* static bool g_curand_availability_logged = false; if (Get().curand_generator_) { CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), seed)); @@ -124,10 +127,11 @@ void Caffe::set_random_seed(const unsigned int seed) { } // RNG seed Get().random_generator_.reset(new RNG(seed)); +*/ } void Caffe::SetDevice(const int device_id) { - int current_device; + /* int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; @@ -144,10 +148,11 @@ void Caffe::SetDevice(const int device_id) { CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); +*/ } void Caffe::DeviceQuery() { - cudaDeviceProp prop; + /*cudaDeviceProp prop; int device; if (cudaSuccess != cudaGetDevice(&device)) { printf("No cuda device present.\n"); @@ -179,6 +184,7 @@ void Caffe::DeviceQuery() { LOG(INFO) << "Kernel execution timeout: " << (prop.kernelExecTimeoutEnabled ? 
"Yes" : "No"); return; +*/ } @@ -205,7 +211,7 @@ void* Caffe::RNG::generator() { } const char* cublasGetErrorString(cublasStatus_t error) { - switch (error) { + /* switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; case CUBLAS_STATUS_NOT_INITIALIZED: @@ -231,11 +237,12 @@ const char* cublasGetErrorString(cublasStatus_t error) { return "CUBLAS_STATUS_LICENSE_ERROR"; #endif } +*/ return "Unknown cublas status"; } const char* curandGetErrorString(curandStatus_t error) { - switch (error) { + /*switch (error) { case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; case CURAND_STATUS_VERSION_MISMATCH: @@ -263,6 +270,7 @@ const char* curandGetErrorString(curandStatus_t error) { case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; } +*/ return "Unknown curand status"; } diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp new file mode 100644 index 00000000..7c564589 --- /dev/null +++ b/src/caffe/device.cpp @@ -0,0 +1,421 @@ +#include "caffe/common.hpp" +#include "caffe/device.hpp" +#include +#include +#include +#include +namespace caffe { +//delete it after test, Yibing +cl_mem test_alloc_mem[10]; +extern long long unsigned device_mem_consumption; + +Device amdDevice; +char* buildOption = "-x clc++ "; + +Device::~Device(){ + //clAmdBlasTeardown(); + free((void*)platformIDs); + free(DeviceIDs); + clReleaseProgram(Program); + clReleaseCommandQueue(CommandQueue); + clReleaseCommandQueue(CommandQueue_helper); + clReleaseContext(Context); + LOG(INFO) << "device destructor"; +} + + +cl_int Device::Init(){ + + //Get Platform Infomation + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); + if(res != CL_SUCCESS){ + fprintf(stderr, "Err: Failed to Get Platform Info\n", res); + return 0; + } + 
platformName[nameLen] = 0; + + //Get OpenCL Information + //res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_VERSION, 64, openclVersion, &nameLen); + //if(res != CL_SUCCESS) { + // fprintf(stderr, "Err: Get OpenCL Info failed!\n", res); + // return 0; + //} + //openclVersion[nameLen] = 0; + //printf("%s %s\n", platformName, openclVersion); + + GetDeviceInfo(); + cl_device_id * pDevices; + cl_uint uiNumDevices; + cl_bool unified_memory = false; + switch(Caffe::mode()) { + case Caffe::GPU: + //choose_gpu(); + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if(0 == uiNumDevices){ + LOG(FATAL) << "Err: No GPU devices"; + } + else{ + pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices)); + for (int i = 0; i < (int)uiNumDevices; i++){ + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); + if(unified_memory) //skip iGPU + continue; + else {//we pick the first GPU we found + pDevices[0] = pDevices[i]; + } + } + } + LOG(INFO) << "picked device type: GPU"; + break; + case Caffe::CPU: + //choose_cpu(); + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if(0 == uiNumDevices){ + LOG(FATAL) << "Err: No CPU devices"; + } + pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) ); + LOG(INFO) << "picked device type: CPU"; + break; + case Caffe::APU: + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if(0 == uiNumDevices){ + LOG(FATAL) << "Err: No GPU devices"; + } + else{ + pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices)); 
+ for (int i = 0; i < (int)uiNumDevices; i++){ + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); + if(unified_memory) //we pick the first GPU we found + pDevices[0] = pDevices[i]; + else {//skip dGPU + continue; + } + } + } + LOG(INFO) << "picked device type: APU"; + break; + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } + + //Create Context + Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); + if(NULL == Context){ + fprintf(stderr,"Err: Failed to Create Context\n"); + return 0; + } + + //Create CommandQueue + CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL); + CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL); + if(NULL == CommandQueue || NULL == CommandQueue_helper){ + fprintf(stderr,"Err: Failed to Create Commandqueue\n"); + return 0; + } + + //Read our own kernel file + const char *pFileName = "../../src/caffe/OCL_kernel.cl"; + const char *pSource; + std::string strSource = ""; + ConvertToString(pFileName, strSource); + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = {0}; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL); + if(NULL == Program){ + fprintf(stderr,"Err: Failed to create program\n"); + } + + //Build Program + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL); + LOG(INFO) << "Build Program"; + if(CL_SUCCESS != iStatus){ + fprintf(stderr,"Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram(Program); + } + + /* + //Setup AmdBlas; + cl_int err; + err = clAmdBlasSetup(); + if(err != CL_SUCCESS){ + printf("clAmdBlasSetup() failed with %d\n", err); + } + */ + row = clblasRowMajor; + col = 
clblasColumnMajor; + + /* + //delete after test the large buffer allocation, Yibing + long long global_mem_size_limit = 1024*1024; //4*1024*1024*1024; + global_mem_size_limit *= (long long)(0.0*1024.0); + //global_mem_size_limit = 16834887680/2; + long long available_global_mem_size = 1024*1024; + available_global_mem_size *= 20*1024; + + long long global_mem_malloc_size_limit = 1024*1024; + while(available_global_mem_size > global_mem_size_limit){ + long long size_; + if((available_global_mem_size - global_mem_size_limit) >= global_mem_malloc_size_limit){ + size_ = global_mem_malloc_size_limit; + }else{ + size_ = available_global_mem_size - global_mem_size_limit; + } + available_global_mem_size = available_global_mem_size - size_; + int *tmpData = (int *)malloc(size_); + cl_int err; + int i = 0; + test_alloc_mem[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, size_, NULL, &err); + err = clEnqueueWriteBuffer(CommandQueue, test_alloc_mem[i], CL_TRUE, 0, size_, tmpData, 0, NULL, NULL); + i++; + device_mem_consumption += size_; + //printf("self alloc, device_mem_consumption = %lu\n", device_mem_consumption); + if(err != CL_SUCCESS) { + printf("Large Buffer Allocation failed! 
error_code = %d\n", err); + printf("self alloc, device_mem_consumption = %llu\n", device_mem_consumption); + exit(1); + } + + cl_ulong free_mem_size, mem_size; + cl_int err1 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,sizeof(cl_ulong),&free_mem_size,NULL); + cl_int err2 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&mem_size,NULL); + //std::cout<<"free memory size after allocation = "<(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + } + + +} + +template +void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){ + cl_int err; + std::size_t paramValueSize; + + err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); + if(err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to 
find OpenCL device info:" << str; + return; + } + + std::string content; + T * info = (T *) alloca (sizeof(T) * paramValueSize); + err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); + if(err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + + switch(name) +{ + case CL_DEVICE_TYPE: + { + std::string deviceType; + appendBitfield( + *(reinterpret_cast(info)),CL_DEVICE_TYPE_CPU,"CL_DEVICE_TYPE_CPU",deviceType); + + appendBitfield( + *(reinterpret_cast(info)),CL_DEVICE_TYPE_GPU,"CL_DEVICE_TYPE_GPU",deviceType); + + appendBitfield( + *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_ACCELERATOR,"CL_DEVICE_TYPE_ACCELERATOR",deviceType); + + appendBitfield( + *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_DEFAULT,"CL_DEVICE_TYPE_DEFAULT",deviceType); + + LOG(INFO) << "\t " << str << ":\t" << deviceType; + } + break; + case CL_DEVICE_EXECUTION_CAPABILITIES: + { + std::string memType; + appendBitfield( + *(reinterpret_cast(info)),CL_EXEC_KERNEL,"CL_EXEC_KERNEL",memType); + + appendBitfield( + *(reinterpret_cast(info)),CL_EXEC_NATIVE_KERNEL,"CL_EXEC_NATIVE_KERNEL",memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + + } + break; + case CL_DEVICE_QUEUE_PROPERTIES: + { + std::string memType; + appendBitfield(*(reinterpret_cast(info)),CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,"CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE",memType); + + appendBitfield(*(reinterpret_cast(info)),CL_QUEUE_PROFILING_ENABLE,"CL_QUEUE_PROFILING_ENABLE",memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + } + break; + default: + LOG(INFO) << "\t" << str << ":\t" << *info; + break; +} + +} + +template +void Device::appendBitfield(T info, T value , std::string name , std::string &str) +{ + if(info & value) + { + if (str.length() > 0) + { + str.append(" | "); + } + str.append(name); + } +} + + +} // namespace caffe + diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 
928ef5ee..b73f1a93 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -67,6 +67,16 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } +template +void ConvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ConvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ConvolutionLayer); #endif diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu similarity index 100% rename from src/caffe/layers/absval_layer.cu rename to src/caffe/layers/cufiles/absval_layer.cu diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu similarity index 100% rename from src/caffe/layers/base_data_layer.cu rename to src/caffe/layers/cufiles/base_data_layer.cu diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu similarity index 100% rename from src/caffe/layers/bnll_layer.cu rename to src/caffe/layers/cufiles/bnll_layer.cu diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu similarity index 100% rename from src/caffe/layers/concat_layer.cu rename to src/caffe/layers/cufiles/concat_layer.cu diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu similarity index 100% rename from src/caffe/layers/contrastive_loss_layer.cu rename to src/caffe/layers/cufiles/contrastive_loss_layer.cu diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu similarity index 100% rename from src/caffe/layers/conv_layer.cu rename to src/caffe/layers/cufiles/conv_layer.cu diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_conv_layer.cu rename to src/caffe/layers/cufiles/cudnn_conv_layer.cu diff --git 
a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_pooling_layer.cu rename to src/caffe/layers/cufiles/cudnn_pooling_layer.cu diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_relu_layer.cu rename to src/caffe/layers/cufiles/cudnn_relu_layer.cu diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_sigmoid_layer.cu rename to src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_softmax_layer.cu rename to src/caffe/layers/cufiles/cudnn_softmax_layer.cu diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu similarity index 100% rename from src/caffe/layers/cudnn_tanh_layer.cu rename to src/caffe/layers/cufiles/cudnn_tanh_layer.cu diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu similarity index 100% rename from src/caffe/layers/deconv_layer.cu rename to src/caffe/layers/cufiles/deconv_layer.cu diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu similarity index 100% rename from src/caffe/layers/dropout_layer.cu rename to src/caffe/layers/cufiles/dropout_layer.cu diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu similarity index 100% rename from src/caffe/layers/eltwise_layer.cu rename to src/caffe/layers/cufiles/eltwise_layer.cu diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu similarity index 100% rename from src/caffe/layers/euclidean_loss_layer.cu rename to 
src/caffe/layers/cufiles/euclidean_loss_layer.cu diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu similarity index 100% rename from src/caffe/layers/exp_layer.cu rename to src/caffe/layers/cufiles/exp_layer.cu diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu similarity index 100% rename from src/caffe/layers/filter_layer.cu rename to src/caffe/layers/cufiles/filter_layer.cu diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu similarity index 100% rename from src/caffe/layers/hdf5_data_layer.cu rename to src/caffe/layers/cufiles/hdf5_data_layer.cu diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu similarity index 100% rename from src/caffe/layers/hdf5_output_layer.cu rename to src/caffe/layers/cufiles/hdf5_output_layer.cu diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu similarity index 100% rename from src/caffe/layers/im2col_layer.cu rename to src/caffe/layers/cufiles/im2col_layer.cu diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu similarity index 100% rename from src/caffe/layers/inner_product_layer.cu rename to src/caffe/layers/cufiles/inner_product_layer.cu diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu similarity index 100% rename from src/caffe/layers/log_layer.cu rename to src/caffe/layers/cufiles/log_layer.cu diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu similarity index 100% rename from src/caffe/layers/lrn_layer.cu rename to src/caffe/layers/cufiles/lrn_layer.cu diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu similarity index 100% rename from src/caffe/layers/mvn_layer.cu rename to src/caffe/layers/cufiles/mvn_layer.cu diff --git a/src/caffe/layers/pooling_layer.cu 
b/src/caffe/layers/cufiles/pooling_layer.cu similarity index 100% rename from src/caffe/layers/pooling_layer.cu rename to src/caffe/layers/cufiles/pooling_layer.cu diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu similarity index 100% rename from src/caffe/layers/power_layer.cu rename to src/caffe/layers/cufiles/power_layer.cu diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu similarity index 100% rename from src/caffe/layers/prelu_layer.cu rename to src/caffe/layers/cufiles/prelu_layer.cu diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu similarity index 100% rename from src/caffe/layers/reduction_layer.cu rename to src/caffe/layers/cufiles/reduction_layer.cu diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu similarity index 100% rename from src/caffe/layers/relu_layer.cu rename to src/caffe/layers/cufiles/relu_layer.cu diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu similarity index 100% rename from src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu rename to src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu similarity index 100% rename from src/caffe/layers/sigmoid_layer.cu rename to src/caffe/layers/cufiles/sigmoid_layer.cu diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu similarity index 100% rename from src/caffe/layers/silence_layer.cu rename to src/caffe/layers/cufiles/silence_layer.cu diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu similarity index 100% rename from src/caffe/layers/slice_layer.cu rename to src/caffe/layers/cufiles/slice_layer.cu diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu 
similarity index 100% rename from src/caffe/layers/softmax_layer.cu rename to src/caffe/layers/cufiles/softmax_layer.cu diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu similarity index 100% rename from src/caffe/layers/softmax_loss_layer.cu rename to src/caffe/layers/cufiles/softmax_loss_layer.cu diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu similarity index 100% rename from src/caffe/layers/split_layer.cu rename to src/caffe/layers/cufiles/split_layer.cu diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu similarity index 100% rename from src/caffe/layers/tanh_layer.cu rename to src/caffe/layers/cufiles/tanh_layer.cu diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu similarity index 100% rename from src/caffe/layers/threshold_layer.cu rename to src/caffe/layers/cufiles/threshold_layer.cu diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index ec1256fd..7f1ac8f6 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -67,6 +67,16 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } +template +void DropoutLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void DropoutLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(DropoutLayer); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index c8d41499..d5207889 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -309,6 +309,15 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } +template +void PoolingLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void PoolingLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& 
bottom){ +} #ifdef CPU_ONLY STUB_GPU(PoolingLayer); diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index cc00319a..e05080bf 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -36,6 +36,16 @@ void ReLULayer::Backward_cpu(const vector*>& top, } } +template +void ReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ReLULayer); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 7617ccfb..200ca657 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -7,7 +7,7 @@ namespace caffe { SyncedMemory::~SyncedMemory() { - if (cpu_ptr_ && own_cpu_data_) { +/* if (cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_); } @@ -16,10 +16,11 @@ SyncedMemory::~SyncedMemory() { CUDA_CHECK(cudaFree(gpu_ptr_)); } #endif // CPU_ONLY +*/ } inline void SyncedMemory::to_cpu() { - switch (head_) { + /* switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_); caffe_memset(size_, 0, cpu_ptr_); @@ -42,9 +43,11 @@ inline void SyncedMemory::to_cpu() { case SYNCED: break; } +*/ } inline void SyncedMemory::to_gpu() { +/* #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: @@ -66,6 +69,7 @@ inline void SyncedMemory::to_gpu() { #else NO_GPU; #endif +*/ } const void* SyncedMemory::cpu_data() { @@ -74,13 +78,14 @@ const void* SyncedMemory::cpu_data() { } void SyncedMemory::set_cpu_data(void* data) { - CHECK(data); + /*CHECK(data); if (own_cpu_data_) { CaffeFreeHost(cpu_ptr_); } cpu_ptr_ = data; head_ = HEAD_AT_CPU; own_cpu_data_ = false; +*/ } const void* SyncedMemory::gpu_data() { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 0aab6b17..6cbf208d 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,3 +1,5 @@ +// Copyright 2014 BVLC and 
contributors. + #include #include @@ -7,6 +9,9 @@ #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" +static const clblasOrder order = clblasColumnMajor; +#define pi 3.1415926 + namespace caffe { template<> @@ -31,6 +36,92 @@ void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, ldb, beta, C, N); } +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +template <> +cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) ); + return event; +} + +template <> +cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) ); + return event; +} + + +template <> +cl_event caffe_gpu_gemmex(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) ); + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); + return event; + } + +template <> +cl_event caffe_gpu_gemmex(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) ); + CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); + return event; +} + template <> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, @@ -45,6 +136,42 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } +template <> +void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, + float* y, size_t offy, int incy) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, + M, N, (cl_float)alpha, (cl_mem)A, offA, lda, + (cl_mem)x, offx, incx, (cl_float)beta, + (cl_mem)y, offy, incy, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +template <> +void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, + double* y, size_t offy, int incy) { + clblasTranspose transA = (TransA == CblasNoTrans)? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + +} + + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { +} + template <> void caffe_axpy(const int N, const float alpha, const float* X, float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } @@ -53,10 +180,22 @@ template <> void caffe_axpy(const int N, const double alpha, const double* X, double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } -template -void caffe_set(const int N, const Dtype alpha, Dtype* Y) { +template <> +void caffe_gpu_axpy(const int N, const float alpha, const float* X, + float* Y) { + CLBLAS_CHECK( clblasSaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) ); +} + +template <> +void caffe_gpu_axpy(const int N, const double alpha, const double* X, + double* Y) { + CLBLAS_CHECK( clblasDaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) ); +} + +template <> +void caffe_set(const int N, const float alpha, float* Y) { if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + memset(Y, 0, sizeof(float) * N); return; } for (int i = 0; i < N; ++i) { @@ -64,9 +203,16 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) { } } -template void caffe_set(const int N, const int alpha, int* Y); -template void caffe_set(const int N, const float alpha, float* Y); -template void caffe_set(const int N, const double alpha, double* Y); +template <> +void caffe_set(const int N, const double alpha, double* Y) { + if (alpha == 
0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { @@ -82,27 +228,26 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { } } -template -void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); -#else - NO_GPU; -#endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } +template <> +void caffe_copy(const int N, const float* X, float* Y) { + cblas_scopy(N, X, 1, Y, 1); } -template void caffe_copy(const int N, const int* X, int* Y); -template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); -template void caffe_copy(const int N, const float* X, float* Y); -template void caffe_copy(const int N, const double* X, double* Y); +template <> +void caffe_copy(const int N, const double* X, double* Y) { + cblas_dcopy(N, X, 1, Y, 1); +} + +template <> +void caffe_gpu_copy(const int N, const float* X, float* Y) { + CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + +} + +template <> +void caffe_gpu_copy(const int N, const double* X, double* Y) { + CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} template <> void caffe_scal(const int N, const float alpha, float *X) { @@ -114,6 +259,30 @@ void caffe_scal(const int N, const double alpha, double *X) { cblas_dscal(N, alpha, X, 1); } +template <> +void caffe_gpu_scal(const int N, const float alpha, float *X) { + CLBLAS_CHECK(clblasSscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_scal(const int N, const double alpha, double *X) { + CLBLAS_CHECK(clblasDscal(N, alpha, 
(cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template <> +void caffe_gpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + template <> void caffe_cpu_axpby(const int N, const float alpha, const float* X, const float beta, float* Y) { @@ -206,26 +375,6 @@ void caffe_exp(const int n, const double* a, double* y) { vdExp(n, a, y); } -template <> -void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); -} - -template <> -void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); -} - -template <> -void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); -} - -template <> -void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); -} - unsigned int caffe_rng_rand() { return (*caffe_rng())(); } @@ -253,6 +402,8 @@ void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } + + //LOG(INFO) << "caffe_rng_uniform"; } template @@ -272,9 +423,11 @@ void caffe_rng_gaussian(const int n, const Dtype a, boost::normal_distribution random_distribution(a, sigma); boost::variate_generator > variate_generator(caffe_rng(), random_distribution); + //variate_generator(37, random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } + //LOG(INFO) << "caffe_rng_guassian"; } template @@ -297,6 +450,7 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } + //LOG(INFO) << "caffe_rng_bernoulli"; } template @@ -304,50 +458,31 @@ void caffe_rng_bernoulli(const int n, const double p, int* r); template void caffe_rng_bernoulli(const int n, const float 
p, int* r); - -template -void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = static_cast(variate_generator()); - } +// +template <> +float caffe_cpu_dot(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); } -template -void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); - template <> -float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); +double caffe_cpu_dot(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); } template <> -double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); +void caffe_gpu_dot(const int n, const float* x, const float* y, + float* out) { + //need to pass in scratchBuff + //AMDBLAS_CHECK(clAmdBlasSdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template -Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { - return caffe_cpu_strided_dot(n, x, 1, y, 1); +template <> +void caffe_gpu_dot(const int n, const double* x, const double* y, + double * out) { + //need to pass in scratchBuff + //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template -float caffe_cpu_dot(const int n, const float* x, const float* y); - -template -double caffe_cpu_dot(const int n, const double* x, const double* y); - template <> int caffe_cpu_hamming_distance(const int n, const float* x, const float* y) { @@ -380,6 +515,18 @@ 
double caffe_cpu_asum(const int n, const double* x) { return cblas_dasum(n, x, 1); } +template <> +void caffe_gpu_asum(const int n, const float* x, float* y) { +} + +template <> +void caffe_gpu_asum(const int n, const double* x, double* y) { +} + +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs); + template <> void caffe_cpu_scale(const int n, const float alpha, const float *x, float* y) { @@ -394,4 +541,129 @@ void caffe_cpu_scale(const int n, const double alpha, const double *x, cblas_dscal(n, alpha, y, 1); } +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + float* y) { +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + double* y) { +} + +template +void set_kernel(const int n, const Dtype alpha, Dtype* y) { +} + +template <> +void caffe_gpu_set(const int N, const float alpha, float* Y) { + if (alpha == 0) { + return; + } +} + +template <> +void caffe_gpu_set(const int N, const double alpha, double* Y) { + if (alpha == 0) { + return; + } +} + +template +void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { +} + +template <> +void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { +} + +template <> +void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { +} + +template +void mul_kernel(const int n, const Dtype* a, + const Dtype* b, Dtype* y) { +} + +template <> +void caffe_gpu_mul(const int N, const float* a, + const float* b, float* y) { +} + +template <> +void caffe_gpu_mul(const int N, const double* a, + const double* b, double* y) { +} + +template +void div_kernel(const int n, const Dtype* a, + const Dtype* b, Dtype* y) { +} + +template <> +void caffe_gpu_div(const int N, const float* a, + const float* b, float* y) { +} + +template <> +void caffe_gpu_div(const int N, const double* a, + const double* b, double* y) { +} + +template +void powx_kernel(const int n, const 
Dtype* a, + const Dtype alpha, Dtype* y) { +} + +template <> +void caffe_gpu_powx(const int N, const float* a, + const float alpha, float* y) { +} + +template <> +void caffe_gpu_powx(const int N, const double* a, + const double alpha, double* y) { +} + + +void popc_kernel(const int n, const float* a, + const float* b, uint8_t* y) { +} + +void popcll_kernel(const int n, const double* a, + const double* b, uint8_t* y) { +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const float* x, + const float* y) { +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const double* x, + const double* y) { +} + +void caffe_gpu_rng_uniform(const int n, unsigned int* r) { +} + +template <> +void caffe_gpu_rng_uniform(const int n, const float a, const float b, + float* r) { +} +template <> +void caffe_gpu_rng_uniform(const int n, const double a, const double b, + double* r) { +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, + float* r) { +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, + double* r) { +} } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp.protect b/src/caffe/util/math_functions.cpp.protect new file mode 100644 index 00000000..166b709a --- /dev/null +++ b/src/caffe/util/math_functions.cpp.protect @@ -0,0 +1,413 @@ +#include +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" + + +namespace caffe { + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_axpy(const int N, const float alpha, const float* X, + float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + +template <> +void caffe_axpy(const int N, const double alpha, const double* X, + double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_add_scalar(const int N, const float alpha, float* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template <> +void caffe_add_scalar(const int N, const double alpha, double* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + 
+template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { +#ifndef CPU_ONLY + // NOLINT_NEXT_LINE(caffe/alt_fn) + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); +#else + NO_GPU; +#endif + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_scal(const int N, const float alpha, float *X) { + cblas_sscal(N, alpha, X, 1); +} + +template <> +void caffe_scal(const int N, const double alpha, double *X) { + cblas_dscal(N, alpha, X, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_add(const int n, const float* a, const float* b, + float* y) { + vsAdd(n, a, b, y); +} + +template <> +void caffe_add(const int n, const double* a, const double* b, + double* y) { + vdAdd(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const float* a, const float* b, + float* y) { + vsSub(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const double* a, const double* b, + double* y) { + vdSub(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const float* a, const float* b, + float* y) { + vsMul(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const double* a, const double* b, + double* y) { + vdMul(n, a, b, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, + float* y) { + vsDiv(n, a, b, y); +} + 
+template <> +void caffe_div(const int n, const double* a, const double* b, + double* y) { + vdDiv(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const float* a, const float b, + float* y) { + vsPowx(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const double* a, const double b, + double* y) { + vdPowx(n, a, b, y); +} + +template <> +void caffe_sqr(const int n, const float* a, float* y) { + vsSqr(n, a, y); +} + +template <> +void caffe_sqr(const int n, const double* a, double* y) { + vdSqr(n, a, y); +} + +template <> +void caffe_exp(const int n, const float* a, float* y) { + vsExp(n, a, y); +} + +template <> +void caffe_exp(const int n, const double* a, double* y) { + vdExp(n, a, y); +} + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +unsigned int caffe_rng_rand() { + return (*caffe_rng())(); +} + +template +Dtype caffe_nextafter(const Dtype b) { + return boost::math::nextafter( + b, std::numeric_limits::max()); +} + +template +float caffe_nextafter(const float b); + +template +double caffe_nextafter(const double b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_uniform(const int n, const float a, const float b, + float* r); + +template +void caffe_rng_uniform(const int n, const double a, const double b, + double* r); + +template +void caffe_rng_gaussian(const int n, const Dtype a, 
+ const Dtype sigma, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution random_distribution(a, sigma); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_gaussian(const int n, const float mu, + const float sigma, float* r); + +template +void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); + +template <> +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { + return caffe_cpu_strided_dot(n, 
x, 1, y, 1); +} + +template +float caffe_cpu_dot(const int n, const float* x, const float* y); + +template +double caffe_cpu_dot(const int n, const double* x, const double* y); + +template <> +int caffe_cpu_hamming_distance(const int n, const float* x, + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +int caffe_cpu_hamming_distance(const int n, const double* x, + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +float caffe_cpu_asum(const int n, const float* x) { + return cblas_sasum(n, x, 1); +} + +template <> +double caffe_cpu_asum(const int n, const double* x) { + return cblas_dasum(n, x, 1); +} + +template <> +void caffe_cpu_scale(const int n, const float alpha, const float *x, + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +template <> +void caffe_cpu_scale(const int n, const double alpha, const double *x, + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +} // namespace caffe From 3965d0c242d9754e594054ffe784996ca08a51cd Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 11 Jul 2015 14:14:07 +0800 Subject: [PATCH 002/124] Synced memory changes --- include/caffe/blob.hpp | 1 + include/caffe/syncedmem.hpp | 18 +- src/caffe/OCL_kernel.cl | 1416 ++++++++++++++++++++++++++ src/caffe/blob.cpp | 6 + src/caffe/common.cpp | 5 + src/caffe/layers/base_data_layer.cpp | 20 + src/caffe/syncedmem.cpp | 88 +- 7 files changed, 1531 insertions(+), 23 deletions(-) create mode 100644 src/caffe/OCL_kernel.cl diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 472cc184..160539aa 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -220,6 +220,7 @@ class Blob { const Dtype* cpu_data() const; void set_cpu_data(Dtype* data); const Dtype* gpu_data() const; + const Dtype* gpu_cache_data() const; const Dtype* cpu_diff() const; const Dtype* gpu_diff() const; Dtype* mutable_cpu_data(); diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 1b726de9..0bcad1dc 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -42,29 +42,41 @@ class SyncedMemory { public: SyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false) {} + own_cpu_data_(false), is_data_layer_(false) { + ocl_setup(); + } explicit SyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false) {} + own_cpu_data_(false), data_layer_(false) { + ocl_setup(); + } + ~SyncedMemory(); const 
void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); + //const void* gpu_cache_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; SyncedHead head() { return head_; } size_t size() { return size_; } + void set_data_layer(){ data_layer_ = true; } + private: + void ocl_setup(); + protected: + cl_kernel oclmem_kernel; private: void to_cpu(); void to_gpu(); void* cpu_ptr_; void* gpu_ptr_; + void* gpu_cache_ptr_; size_t size_; SyncedHead head_; bool own_cpu_data_; - + bool data_layer_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl new file mode 100644 index 00000000..980dc37c --- /dev/null +++ b/src/caffe/OCL_kernel.cl @@ -0,0 +1,1416 @@ +#pragma OPENCL EXTENSION cl_amd_printf : enable + +//beginning of the looooooong gpu_random_generator kernel +//we use the open sourced threefry's GPU implementation +typedef uint uint32_t; + +struct r123array4x32 { uint32_t v[4]; }; + +enum r123_enum_threefry32x4 +{ + R_32x4_0_0 = 10, R_32x4_0_1 = 26, + R_32x4_1_0 = 11, R_32x4_1_1 = 21, + R_32x4_2_0 = 13, R_32x4_2_1 = 27, + R_32x4_3_0 = 23, R_32x4_3_1 = 5, + R_32x4_4_0 = 6, R_32x4_4_1 = 20, + R_32x4_5_0 = 17, R_32x4_5_1 = 11, + R_32x4_6_0 = 25, R_32x4_6_1 = 10, + R_32x4_7_0 = 18, R_32x4_7_1 = 20 +}; + +inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32 - N) & 31)); +} + +typedef struct r123array4x32 threefry4x32_ctr_t; +typedef struct r123array4x32 threefry4x32_key_t; +typedef struct r123array4x32 threefry4x32_ukey_t; + +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) +{ + 
threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + /* + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + if (Nrounds > 0) + { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + 
X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 7) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 11) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= 
X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 5; + } if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 
26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; 
+ X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 35) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + 
X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 47) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 54) { + 
X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + 
/**
 * Bernoulli mask generator driven by threefry4x32_R.
 *
 * Each work-item with global id gdx < numrandom produces four 32-bit words
 * by calling threefry4x32_R with the shared counter ctr_i and a key whose
 * four words are all set to gdx. Every word is scaled into the range
 * spanned by inf and sup, compared against threshold, and stored as 1
 * (above threshold) or 0 in the matching lane of randomnumber[gdx].
 *
 * @param randomnumber  output buffer, one uint4 of 0/1 flags per work-item
 * @param ctr_i         counter block handed unchanged to threefry4x32_R
 * @param inf           lower end of the scaled range
 * @param sup           upper end of the scaled range
 * @param threshold     a lane becomes 1 when its scaled value exceeds this
 * @param nrounds       round count forwarded to threefry4x32_R
 * @param numrandom     number of uint4 elements to write
 */
template <class T>
__kernel void PRNG_threefry4x32(
    __global uint4 *randomnumber,
    threefry4x32_ctr_t ctr_i,
    T inf,
    T sup,
    T threshold,
    uint nrounds,
    uint numrandom
){
    const size_t gdx = get_global_id(0);

    // Work-items at or beyond numrandom have no output slot.
    if (gdx >= numrandom)
        return;

    // All-ones uint, converted to float, is the divisor that normalises a
    // raw 32-bit word before it is stretched onto the requested range.
    const uint maxUint = ~(uint)0;
    const float r = (float)maxUint;

    threefry4x32_ctr_t ctr = ctr_i;
    threefry4x32_ukey_t ukey;

    // The global id is the per-work-item key; the counter is shared.
    ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;

    const threefry4x32_ctr_t random4 = threefry4x32_R(nrounds, ctr, ukey);

    const T span = sup - inf;
    uint4 frnd;

    frnd.x = ( (((float)random4.v[0]) / r) * span + inf ) > threshold ? 1 : 0;
    frnd.y = ( (((float)random4.v[1]) / r) * span + inf ) > threshold ? 1 : 0;
    frnd.z = ( (((float)random4.v[2]) / r) * span + inf ) > threshold ? 1 : 0;
    frnd.w = ( (((float)random4.v[3]) / r) * span + inf ) > threshold ? 1 : 0;

    randomnumber[gdx] = frnd;
}


template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandom);

template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandom);

//end of the looooooong gpu_random_generator kernel
/**
 * im2col for a batch of images packed side by side in the column buffer.
 *
 * One work-item handles one (image, channel, y_out, x_out) position and
 * copies its ksize x ksize input window into data_col, writing 0 where the
 * window falls outside the image. The index arithmetic places the
 * height_col * width_col outputs of image im_id at column offset
 * im_id * height_col * width_col inside a row that is
 * optnum * height_col * width_col elements long.
 *
 * @param n           number of valid work-items; ids >= n do nothing
 * @param data_im     input images (img_offset is added before use)
 * @param channels    channels per image
 * @param img_offset  element offset applied to data_im
 * @param height      input image height
 * @param width       input image width
 * @param ksize       square kernel extent
 * @param pad         padding subtracted from the window origin
 * @param stride      window step
 * @param height_col  output rows per image
 * @param width_col   output columns per image
 * @param data_col    output column buffer (col_offset is added before use)
 * @param col_offset  element offset applied to data_col
 * @param optnum      number of images sharing one column-buffer row
 */
template <class T>
__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){

    const int index = get_global_id(0);

    // The other kernels in this file bound-check against n; without the
    // check, work-items past n would write outside the column buffer.
    if (index >= n)
        return;

    data_im = data_im + img_offset;
    data_col = data_col + col_offset;

    const int plane_col = height_col * width_col;

    // Decompose the flat id into (im_id, channel_in, y_out, x_out).
    const int x_out = index % width_col;
    const int y_out = (index / width_col) % height_col;
    const int channel_in = (index / width_col / height_col) % channels;
    const int channel_out = channel_in * ksize * ksize;
    const int im_id = index / width_col / height_col / channels;

    // Top-left corner of the input window; may be negative because of pad.
    const int y_in = y_out * stride - pad;
    const int x_in = x_out * stride - pad;
    const int offset_col = channel_out * optnum * plane_col + im_id * plane_col;
    const int offset_im = im_id * channels * height * width + channel_in * height * width;

    for(int k_h = 0; k_h < ksize; k_h++){
        for(int k_w = 0; k_w < ksize; k_w++){
            const int x_im = x_in + k_w;
            const int y_im = y_in + k_h;
            const int index_im = y_im * width + x_im;
            const int index_col = (k_h * ksize + k_w) * optnum * plane_col + y_out * width_col + x_out;
            if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
                data_col[offset_col + index_col] = data_im[offset_im + index_im];
            else
                data_col[offset_col + index_col] = 0;
        }
    }
}

template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum);
template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum);
width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); + +template +__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){ + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index;index= 0 && w >= 0 && h < height && w < width) + *data_col=data_im[i * width + j]; + else *data_col=0; + data_col += height_col *width_col; + } + } + } +} + +template 
/**
 * col2im for a column buffer that holds optnum images per row.
 *
 * One work-item per image element (index < n): it sums every column-buffer
 * entry whose window covers that element and stores the total in
 * data_im[index].
 *
 * @param n           number of image elements to produce
 * @param data_col    column buffer (col_offset is added before use)
 * @param col_offset  element offset applied to data_col
 * @param height      image height
 * @param width       image width
 * @param channels    channels per image
 * @param ksize       square kernel extent
 * @param pad         padding added to the element coordinates
 * @param stride      window step
 * @param height_col  column-buffer rows per image
 * @param width_col   column-buffer columns per image
 * @param data_im     output images (img_offset is added before use)
 * @param img_offset  element offset applied to data_im
 * @param optnum      number of images sharing one column-buffer row
 */
template <class T>
__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
    const int index = get_global_id(0);
    data_col = data_col + col_offset;
    data_im = data_im + img_offset;
    if(index < n){
        T val = 0;
        // Element coordinates in the padded image, plus channel and image id.
        const int w = index % width + pad;
        const int h = (index / width) % height + pad;
        const int c = index / (width * height) % channels;
        const int im = index / width / height / channels;
        // Range of output positions whose window contains (h, w).
        const int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
        const int w_col_end = min(w / stride + 1, width_col);
        const int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
        const int h_col_end = min(h / stride + 1, height_col);
        // Folded form of the direct column index
        //   (c*ksize*ksize + (h - h_col*stride)*ksize + (w - w_col*stride))
        //       * height_col*width_col*optnum
        //   + im*height_col*width_col + h_col*width_col + w_col
        // with the h_col / w_col dependent terms gathered into coefficients.
        const int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
        const int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
        const int coeff_w_col = (1 - stride * height_col * width_col * optnum);
        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
            }
        }
        data_im[index] = val;
    }
}
template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum);
template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum);
0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); +template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); + +template +__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){ + + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + im_offset; + if(index < n){ + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } +} +template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int 
/**
 * Max pooling, forward pass.
 *
 * Each work-item walks the flat output indices index, index + global_size,
 * ... below nthreads. For every output element it takes the maximum of
 * bottom_data over the window [hstart, hend) x [wstart, wend) of the
 * (n, c) input plane and writes it to top_data[index].
 *
 * @param nthreads       number of output elements
 * @param bottom_data    input, indexed as ((n*channels + c)*height + h)*width + w
 * @param num            not read by the kernel
 * @param channels       channels per image
 * @param height         input height
 * @param width          input width
 * @param pooled_height  output height
 * @param pooled_width   output width
 * @param kernel_size    square window extent
 * @param stride         window step
 * @param top_data       output, one element per flat index
 */
template <class T>
__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global T* top_data){
    const int step = get_global_size(0);
    for(int index = get_global_id(0); index < nthreads; index += step){
        const int pw = index % pooled_width;
        const int ph = (index / pooled_width) % pooled_height;
        const int c = (index / pooled_width / pooled_height) % channels;
        const int n = index / pooled_width / pooled_height / channels;
        const int hstart = ph * stride;
        const int hend = min(hstart + kernel_size, height);
        const int wstart = pw * stride;
        const int wend = min(wstart + kernel_size, width);
        // Lowest finite float, so inputs below the old -99999999 sentinel
        // are still reported correctly.
        T maxval = -FLT_MAX;
        // Per-iteration view of the (n, c) plane. Advancing bottom_data
        // itself would carry the offset into the next loop iteration and
        // make every later iteration read the wrong plane.
        __global const T* plane = bottom_data + (n * channels + c) * height * width;
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                maxval = max(maxval, plane[h * width + w]);
            }
        }
        top_data[index] = maxval;
    }

}
template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* top_data);
template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global double* top_data);
nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* top_data){ + int index=get_global_id(0); + int tmp=get_global_size(0); + for(index;index +__kernel void MaxPoolBackward(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* top_diff, +const int num, const int channels, const int height, +const int width, const int pooled_height, const int pooled_width, +const int kernel_size, const int stride, __global T* bottom_diff){ + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total){ + // find out the local index + // find out the local offset + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1; + int phend = min(h / stride + 1, pooled_height); + int pwstart = (w < kernel_size) ? 
/**
 * Average pooling, backward pass.
 *
 * Each work-item walks the flat input indices index, index + global_size,
 * ... below nthreads. For every input element it adds up, over all pooled
 * outputs whose window covers the element, top_diff divided by that
 * window's size, and writes the sum to bottom_diff[index].
 *
 * @param nthreads       number of input elements
 * @param top_diff       output gradient, (n*channels + c) planes of
 *                       pooled_height * pooled_width
 * @param num            not read by the kernel
 * @param channels       channels per image
 * @param height         input height
 * @param width          input width
 * @param pooled_height  output height
 * @param pooled_width   output width
 * @param kernel_size    square window extent
 * @param stride         window step
 * @param pad            padding applied on each side
 * @param bottom_diff    input gradient, one element per flat index
 */
template <class T>
__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* bottom_diff){
    const int total = get_global_size(0);
    for(int index = get_global_id(0); index < nthreads; index += total){
        // Coordinates in the padded input.
        const int w = index % width + pad;
        const int h = (index / width) % height + pad;
        const int c = (index / width / height) % channels;
        const int n = index / width / height / channels;
        // Pooled outputs whose window contains (h, w).
        const int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
        const int phend = min(h / stride + 1, pooled_height);
        const int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
        const int pwend = min(w / stride + 1, pooled_width);
        T gradient = 0;
        // Per-iteration view of the (n, c) plane. Advancing top_diff itself
        // would carry the offset into the next loop iteration and make
        // every later iteration read the wrong plane.
        __global const T* plane = top_diff + (n * channels + c) * pooled_height * pooled_width;
        for (int ph = phstart; ph < phend; ++ph) {
            for (int pw = pwstart; pw < pwend; ++pw) {
                // figure out the pooling size
                const int hstart = ph * stride - pad;
                const int wstart = pw * stride - pad;
                const int hend = min(hstart + kernel_size, height + pad);
                const int wend = min(wstart + kernel_size, width + pad);
                const int pool_size = (hend - hstart) * (wend - wstart);
                gradient += plane[ph * pooled_width + pw] / pool_size;
            }
        }
        bottom_diff[index] = gradient;

    }
}

template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* bottom_diff);
template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global double* bottom_diff);
/**
 * Row-wise maximum: out[index] = max over i in [0, dim) of data[index*dim + i].
 *
 * One work-item per row; ids >= num do nothing.
 *
 * @param num   number of rows
 * @param dim   elements per row
 * @param data  input, num rows of dim contiguous elements
 * @param out   output, one maximum per row
 */
template <class T>
__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
    const int index = get_global_id(0);
    if (index >= num)
        return;

    __global const T* row = data + index * dim;

    // -FLT_MAX is kept only as the result for an empty row (dim <= 0).
    // Otherwise the scan starts from the first element, so the double
    // instantiation is also correct for values below -FLT_MAX.
    T maxval = -FLT_MAX;
    if (dim > 0) {
        maxval = row[0];
        for (int i = 1; i < dim; i++)
            maxval = max( row[i], maxval );
    }
    out[index] = maxval;
}

template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
/**
 * Sum of -log(prob_data[i*dim + label[i]]) over i in [0, num), written to
 * loss[0].
 *
 * Each work-item accumulates a partial sum over i = gid, gid + size, ...
 * into resultScratch[gid]; the partials are then combined by a tree
 * reduction that starts at stride 128, so slots 0..255 of resultScratch
 * are read.
 * NOTE(review): the scratch buffer is indexed by global id and the
 * reduction width is fixed, so this presumably expects a launch of exactly
 * one work-group of 256 work-items - confirm against the host code.
 *
 * @param prob_data      probabilities, num rows of dim elements
 * @param loss           output; only loss[0] is written
 * @param label          per-row class index stored as T, truncated to int
 * @param num            number of rows
 * @param dim            elements per row
 * @param resultScratch  local scratch holding one partial sum per work-item
 */
template <class T>
__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){

    const int gid = get_global_id(0);
    const int size = get_global_size(0);

    // Partial sum for this work-item.
    T partial = 0.0;
    for(int i = gid; i < num; i += size){
        partial += -log(prob_data[i * dim + static_cast<int>(label[i])]);
    }
    resultScratch[gid] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction 128 -> 1. Every step ends with a barrier so a
    // work-item never reads a slot another work-item is still updating;
    // the barrier sits outside the `if` so all work-items reach it.
    for(int stride = 128; stride > 0; stride >>= 1){
        if(gid < stride)
            resultScratch[gid] += resultScratch[gid + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(gid == 0)
        loss[0] = resultScratch[0];

}

template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
float alpha, __global float* y); +template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); + +template +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; +} + +template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); +template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); + + +template +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ + int index = get_global_id(0); + if (index < n) +// y[index] = a[index] + alpha; + y[index] = pow(a[index], alpha); +} + +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); + +template +__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){ + int index = get_global_id(0); + if (index < n) + out[index] = in[index] * scale * mask[index]; +} +template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); + + +template +__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T 
scale, __global T *out_diff){ + int index = get_global_id(0); + if (index < n) + out_diff[index] = in_diff[index] * scale * mask[index]; +} +template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); + +template +__kernel void LRNFillScale(const int nthreads, __global const T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, __global T* scale) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + // find out the local offset + int w = index % width; + int h = (index / width) % height; + int n = index / width / height; + int offset = (n * channels * height + h) * width + w; + int step = height * width; + in += offset; + scale += offset; + int head = 0; + int pre_pad = (size - 1) / 2; + int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // until we reach size, nothing needs to be subtracted + while (head < size) { + accum_scale += in[head * step] * in[head * step]; + scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + accum_scale -= in[(head - size) * step] * in[(head - size) * step]; + scale[(head - post_pad) * step] = 1. 
+ accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + accum_scale -= in[(head - size) * step] * in[(head - size) * step]; + scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size; + ++head; + } + } +} +template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global const float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, __global float* scale); +template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global const double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, __global double* scale); + +template +__kernel void LRNComputeOutput(const int nthreads, __global const T* in, __global const T* scale, const T negative_beta, __global T* out) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); +} +template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global const float* in, __global const float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global const double* in, __global const double* scale, const double negative_beta, __global double* out); + +template +__kernel void LRNComputeDiff(const int nthreads, __global const T* bottom_data, __global const T* top_data, __global const T* scale, __global const T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < 
nthreads; index += tmp) { + int w = index % width; + int h = (index / width) % height; + int n = index / width / height; + int offset = (n * channels * height + h) * width + w; + int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + int head = 0; + int pre_pad = size - (size + 1) / 2; + int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + while (head < post_pad) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // until we reach size, nothing needs to be subtracted + while (head < size) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } +} + +template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global const float* bottom_data, __global 
const float* top_data, __global const float* scale, __global const float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global const double* bottom_data, __global const double* top_data, __global const double* scale, __global const double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); + +template +__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +} +template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); + +template +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ + int gidx = get_global_id(0); + int index; + index = (optnum==1) ? 
0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0 ; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; +} +template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 94fdcc35..4cec89ae 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -85,6 +85,12 @@ const Dtype* Blob::gpu_data() const { return (const Dtype*)data_->gpu_data(); } +template +const Dtype* Blob::gpu_cache_data() const { + CHECK(data_); + return (const Dtype*)data_->gpu_cache_data(); +} + template const Dtype* Blob::cpu_diff() const { CHECK(diff_); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index e53a5c0d..052281d4 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -11,6 +11,8 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { + //To fix: for now we use fixed seed to get same result each time + /* int64_t s, seed, pid; FILE* f = fopen("/dev/urandom", "rb"); if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { @@ -27,6 +29,9 @@ int64_t cluster_seedgen(void) { s = time(NULL); seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); return seed; + */ + LOG(WARNING) << "return fixed seed 37"; + return 37; } diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 26a11182..fa4fe30f 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -78,6 +78,26 @@ void BasePrefetchingDataLayer::Forward_cpu( DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); 
} +template +Dtype DataLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + // First, join the thread + JoinPrefetchThread(); + // Copy the data from prefetch thread to data_layer + //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_->count(), prefetch_data_->cpu_data(), 0, NULL, NULL) ); + if (output_labels_) { + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_->count(), prefetch_label_->cpu_data(), 0, NULL, NULL) ); + //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); + } + clFinish(amdDevice.CommandQueue); +#ifdef Track_data_transfer +#endif + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); + return Dtype(0.); +} #ifdef CPU_ONLY STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 200ca657..ce11aa03 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -4,9 +4,23 @@ #include "caffe/syncedmem.hpp" #include "caffe/util/math_functions.hpp" +#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices + namespace caffe { SyncedMemory::~SyncedMemory() { +if (cpu_ptr_ && own_cpu_data_) { + OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL) ); + clFinish(amdDevice.CommandQueue); + } + if(gpu_cache_ptr_ && own_cpu_data_) { + OCL_CHECK( clReleaseMemObject((cl_mem)gpu_cache_ptr_) ); + } + if (gpu_ptr_) { + OCL_CHECK( 
clReleaseMemObject((cl_mem)gpu_ptr_) ); + } + + clReleaseKernel(oclmem_kernel); /* if (cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_); } @@ -17,51 +31,84 @@ SyncedMemory::~SyncedMemory() { } #endif // CPU_ONLY */ +} + +void SyncedMemory::ocl_setup() { + cl_int err=0; + oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); + OCL_CHECK(err); } inline void SyncedMemory::to_cpu() { - /* switch (head_) { +switch (head_) { case UNINITIALIZED: - CaffeMallocHost(&cpu_ptr_, size_); - caffe_memset(size_, 0, cpu_ptr_); + //allocate pre-pinned memory + //pinned_buffer_ptr_ + // if(data_layer_){ + // gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_PERSISTENT_MEM_AMD, size_, NULL, NULL); + // } + // else{ + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); + //} + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); + memset(cpu_ptr_, 0, size_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; - case HEAD_AT_GPU: + case HEAD_AT_GPU:{ #ifndef CPU_ONLY if (cpu_ptr_ == NULL) { - CaffeMallocHost(&cpu_ptr_, size_); + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); own_cpu_data_ = true; } - caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); + OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_ptr_, (cl_mem)gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); head_ = SYNCED; #else NO_GPU; +#endif +#ifdef Track_data_transfer + LOG(WARNING) << "sync: data from GPU to CPU"; #endif break; + } case HEAD_AT_CPU: case SYNCED: break; } -*/ } inline void SyncedMemory::to_gpu() { -/* #ifndef CPU_ONLY - switch (head_) { - case UNINITIALIZED: - CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); - 
caffe_gpu_memset(size_, 0, gpu_ptr_); +switch (head_) { + case UNINITIALIZED:{ + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL); + if(NULL == tmpMem){ + fprintf(stderr,"Failed to create memory object\n"); + break; + } + ocl_memset(oclmem_kernel, tmpMem, (int)0, (int)(size_/sizeof(int))); + gpu_ptr_ = (void*)tmpMem; head_ = HEAD_AT_GPU; break; - case HEAD_AT_CPU: + } + case HEAD_AT_CPU:{ if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL); + if(NULL == tmpMem){ + fprintf(stderr,"Failed to create memory object\n"); + } + gpu_ptr_ = (void*)tmpMem; } - caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); + OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, (cl_mem)gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); head_ = SYNCED; +#ifdef Track_data_transfer + LOG(WARNING) << "sync: data from CPU to GPU"; +#endif break; + } case HEAD_AT_GPU: case SYNCED: break; @@ -69,7 +116,6 @@ inline void SyncedMemory::to_gpu() { #else NO_GPU; #endif -*/ } const void* SyncedMemory::cpu_data() { @@ -78,14 +124,16 @@ const void* SyncedMemory::cpu_data() { } void SyncedMemory::set_cpu_data(void* data) { - /*CHECK(data); +CHECK(data); if (own_cpu_data_) { - CaffeFreeHost(cpu_ptr_); + OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL)); + OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_)); + clFinish(amdDevice.CommandQueue); //is this necessary? 
} - cpu_ptr_ = data; + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_HOST_PTR, size_, data, NULL); + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); head_ = HEAD_AT_CPU; own_cpu_data_ = false; -*/ } const void* SyncedMemory::gpu_data() { From 8a7c2b25492901d0f7d4a45aac375184fa048c74 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 11 Jul 2015 14:45:32 +0800 Subject: [PATCH 003/124] update data layer for AMD_PERSISTENT_MEM --- include/caffe/blob.hpp | 1 + include/caffe/syncedmem.hpp | 2 +- src/caffe/layers/data_layer.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 160539aa..12854689 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -263,6 +263,7 @@ class Blob { * shared_ptr calls its destructor when reset with the "=" operator. */ void ShareDiff(const Blob& other); + void set_data_layer(){data_->set_data_layer(); diff_->set_data_layer();}; bool ShapeEquals(const BlobProto& other); diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 0bcad1dc..61336d7e 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -42,7 +42,7 @@ class SyncedMemory { public: SyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), is_data_layer_(false) { + own_cpu_data_(false), data_layer_(false) { ocl_setup(); } explicit SyncedMemory(size_t size) diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 161a75e0..26eae788 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -48,6 +48,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, top_shape[0] = this->layer_param_.data_param().batch_size(); this->prefetch_data_.Reshape(top_shape); top[0]->ReshapeLike(this->prefetch_data_); + prefetch_data_->set_data_layer(); LOG(INFO) << 
"output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," @@ -57,6 +58,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, vector label_shape(1, this->layer_param_.data_param().batch_size()); top[1]->Reshape(label_shape); this->prefetch_label_.Reshape(label_shape); + prefetch_label_->set_data_layer(); } } From 13cd87f9f0b6366dde8fee8fb2d31648fed93872 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 12 Jul 2015 14:30:12 +0800 Subject: [PATCH 004/124] add Forward_gpu and Backward_gpu for more layers; update math functions, Makefile and im2col.cpp --- Makefile | 29 +++++ include/caffe/util/math_functions.hpp | 14 +-- src/caffe/layers/absval_layer.cpp | 10 ++ src/caffe/layers/base_data_layer.cpp | 7 ++ src/caffe/layers/bnll_layer.cpp | 10 ++ src/caffe/layers/concat_layer.cpp | 10 ++ src/caffe/layers/contrastive_loss_layer.cpp | 10 ++ src/caffe/layers/deconv_layer.cpp | 12 ++ src/caffe/layers/eltwise_layer.cpp | 12 ++ src/caffe/layers/euclidean_loss_layer.cpp | 12 ++ src/caffe/layers/exp_layer.cpp | 10 ++ src/caffe/layers/filter_layer.cpp | 11 ++ src/caffe/layers/hdf5_data_layer.cpp | 6 + src/caffe/layers/hdf5_output_layer.cpp | 11 ++ src/caffe/layers/im2col_layer.cpp | 10 ++ src/caffe/layers/inner_product_layer.cpp | 10 ++ src/caffe/layers/log_layer.cpp | 12 ++ src/caffe/layers/lrn_layer.cpp | 20 ++++ src/caffe/layers/mvn_layer.cpp | 10 ++ src/caffe/layers/power_layer.cpp | 10 ++ src/caffe/layers/prelu_layer.cpp | 11 ++ src/caffe/layers/reduction_layer.cpp | 10 ++ .../sigmoid_cross_entropy_loss_layer.cpp | 5 + src/caffe/layers/sigmoid_layer.cpp | 10 ++ src/caffe/layers/silence_layer.cpp | 11 ++ src/caffe/layers/slice_layer.cpp | 10 ++ src/caffe/layers/softmax_layer.cpp | 11 ++ src/caffe/layers/softmax_loss_layer.cpp | 12 ++ src/caffe/layers/split_layer.cpp | 9 ++ src/caffe/layers/tanh_layer.cpp | 12 ++ src/caffe/layers/threshold_layer.cpp | 5 + src/caffe/util/im2col.cpp | 39 ++++++ src/caffe/util/math_functions.cpp | 
111 +++++++++++++++++- 33 files changed, 480 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 80c5642d..f0ac9e06 100644 --- a/Makefile +++ b/Makefile @@ -163,11 +163,40 @@ ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") endif CUDA_LIB_DIR += $(CUDA_DIR)/lib +################################# +# OpenCL include and library +################################# +OCL_INCLUDE_DIR := $(OCL_DIR)/include +CLBLAS_INCLUDE_DIR := ${CLBLAS_DIR}/include + +OCL_LIB_DIR := +CLBLAS_LIB_DIR := +# add /lib/x86_64 only if it exists +ifneq ("$(wildcard $(OCL_LIB_DIR)/lib/x86_64)","") + OCL_LIB_DIR += $(OCL_DIR)/lib/x86_64 +endif +OCL_LIB_DIR += $(OCL_DIR)/lib/x86 + +# add /lib/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib +endif + +# add /lib64/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib64)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib64 +endif + INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include ifneq ($(CPU_ONLY), 1) INCLUDE_DIRS += $(CUDA_INCLUDE_DIR) LIBRARY_DIRS += $(CUDA_LIB_DIR) LIBRARIES := cudart cublas curand + + INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR) + LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR) + LIBRARIES += OpenCL clBLAS + endif LIBRARIES += glog gflags protobuf leveldb snappy \ lmdb boost_system hdf5_hl hdf5 m \ diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index bcafeb89..2cbbf1f0 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -241,27 +241,19 @@ inline char caffe_sign(Dtype val) { template <> \ void caffe_cpu_##name(const int n, const double* x, double* y) -/* + #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ template \ -__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - CUDA_KERNEL_LOOP(index, n) { \ +void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ operation; \ - } \ } \ template <> \ void 
caffe_gpu_##name(const int n, const float* x, float* y) { \ - NOLINT_NEXT_LINE(whitespace/operators) \ - name##_kernel<<>>( \ - n, x, y); \ } \ template <> \ void caffe_gpu_##name(const int n, const double* x, double* y) { \ - NOLINT_NEXT_LINE(whitespace/operators) \ - name##_kernel<<>>( \ - n, x, y); \ } -*/ + // output is 1 for the positives, 0 for zero, and -1 for the negatives DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 5ce28c9e..30422737 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -35,6 +35,16 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } +template +void AbsValLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void AbsValLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(AbsValLayer); #endif diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index fa4fe30f..917059b8 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -99,6 +99,13 @@ Dtype DataLayer::Forward_gpu(const vector*>& bottom, return Dtype(0.); } +template +void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + + + #ifdef CPU_ONLY STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 9ba0ea9a..09e2bc89 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -38,6 +38,16 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } +template +void BNLLLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void BNLLLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(BNLLLayer); #endif diff --git 
a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 1cac8fc3..6af287a9 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -88,6 +88,16 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } } +template +void ConcatLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ConcatLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ConcatLayer); #endif diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 25e16781..aad4cab3 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -111,6 +111,16 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } +template +void ContrastiveLossLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ContrastiveLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ContrastiveLossLayer); #endif diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index a4612963..e8937238 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -69,6 +69,18 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } } +template +void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void DeconvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(DeconvolutionLayer); #endif diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index a8070073..cffc743d 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -151,6 +151,18 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } +template +void 
EltwiseLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void EltwiseLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(EltwiseLayer); #endif diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 80efa31b..9c37c18b 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -47,6 +47,18 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } +template +void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void EuclideanLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(EuclideanLossLayer); #endif diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index c7e7c60c..547fca6a 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -59,6 +59,16 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } +template +void ExpLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ExpLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ExpLayer); #endif diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index be1db32d..4d004ad4 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -117,6 +117,17 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } +template +void FilterLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void FilterLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + #ifdef CPU_ONLY STUB_GPU(FilterLayer); #endif diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 
8a782f7e..649dc020 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -157,6 +157,12 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, } } +template +void HDF5DataLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + + #ifdef CPU_ONLY STUB_GPU_FORWARD(HDF5DataLayer, Forward); #endif diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f63375c3..7d1ca097 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -67,6 +67,17 @@ void HDF5OutputLayer::Backward_cpu(const vector*>& top, return; } +template +void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void HDF5OutputLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + return; +} + #ifdef CPU_ONLY STUB_GPU(HDF5OutputLayer); #endif diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 1c802714..ddf6c989 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -85,6 +85,16 @@ void Im2colLayer::Backward_cpu(const vector*>& top, } } +template +void Im2colLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void Im2colLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(Im2colLayer); #endif diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 83c3235e..4d25215a 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -119,6 +119,16 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, } } +template +void InnerProductLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void InnerProductLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const 
vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(InnerProductLayer); #endif diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 55a227f6..9d3977a7 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -77,6 +77,18 @@ void LogLayer::Backward_cpu(const vector*>& top, caffe_mul(count, top_diff, bottom_diff, bottom_diff); } +template +void LogLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void LogLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(LogLayer); #endif diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 36c1ace4..47fa5ed5 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -247,6 +247,26 @@ void LRNLayer::WithinChannelBackward( } } +template +void LRNLayer::CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + +template +void LRNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void LRNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 3e79bddc..84701831 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -134,6 +134,16 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } +template +void MVNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void MVNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(MVNLayer); diff --git a/src/caffe/layers/power_layer.cpp 
b/src/caffe/layers/power_layer.cpp index 4fe34c49..bc14fffb 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -94,6 +94,16 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } +template +void PowerLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void PowerLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(PowerLayer); #endif diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 81831755..4db0dc7c 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -129,6 +129,17 @@ void PReLULayer::Backward_cpu(const vector*>& top, } } +template +void PReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void PReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + #ifdef CPU_ONLY STUB_GPU(PReLULayer); diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 8ae6329e..c4a8b4e0 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -122,6 +122,16 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } } +template +void ReductionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void ReductionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(ReductionLayer); #endif diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index cc236fe1..1a4329da 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -70,6 +70,11 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } +template +void SigmoidCrossEntropyLossLayer::Backward_gpu(const vector*>& top, + 
const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 48c38490..30ad9b0b 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -39,6 +39,16 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } +template +void SigmoidLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void SigmoidLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(SigmoidLayer); #endif diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 4abf9eff..ecd12d12 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -17,6 +17,17 @@ void SilenceLayer::Backward_cpu(const vector*>& top, } } +template +void SilenceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Do nothing. 
+} + +template +void SilenceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(SilenceLayer); #endif diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index e4418c9c..76021faa 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -110,6 +110,16 @@ void SliceLayer::Backward_cpu(const vector*>& top, } } +template +void SliceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void SliceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + #ifdef CPU_ONLY STUB_GPU(SliceLayer); #endif diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 04712c9e..488e836a 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -86,6 +86,17 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } +template +void SoftmaxLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void SoftmaxLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + #ifdef CPU_ONLY STUB_GPU(SoftmaxLayer); diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index ba312f67..6380f264 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -120,6 +120,18 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } +template +void SoftmaxWithLossLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(SoftmaxWithLossLayer); #endif diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 
272cb59c..932b240b 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -49,6 +49,15 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } +template +void SplitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void SplitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} #ifdef CPU_ONLY STUB_GPU(SplitLayer); diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index ee5ed773..abc09bbc 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -37,6 +37,18 @@ void TanHLayer::Backward_cpu(const vector*>& top, } } +template +void TanHLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + +template +void TanHLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom){ +} + + + #ifdef CPU_ONLY STUB_GPU(TanHLayer); #endif diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 2365e7b9..345fd6b7 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -24,6 +24,11 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } } +template +void ThresholdLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top){ +} + #ifdef CPU_ONLY STUB_GPU_FORWARD(ThresholdLayer, Forward); #endif diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index c48f31f3..6545d98c 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -80,4 +80,43 @@ template void col2im_cpu(const double* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); + + +template +void im2col_gpu(const Dtype* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) { +} 
+ + +// Explicit instantiation +template void im2col_gpu(const float* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); +template void im2col_gpu(const double* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); + + +template +void col2im_gpu(const Dtype* data_col, const int channels, + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im) { +} + +// Explicit instantiation +template void col2im_gpu(const float* data_col, const int channels, + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); +template void col2im_gpu(const double* data_col, const int channels, + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im); + } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 6cbf208d..364fbe11 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -450,7 +450,6 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } - //LOG(INFO) << "caffe_rng_bernoulli"; } template @@ -458,6 +457,26 @@ void caffe_rng_bernoulli(const int n, const double p, int* r); template void caffe_rng_bernoulli(const int n, const float p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + 
boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); // template <> float caffe_cpu_dot(const int n, const float* x, const float* y) { @@ -523,6 +542,10 @@ template <> void caffe_gpu_asum(const int n, const double* x, double* y) { } +DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) + - (x[index] < Dtype(0))); +DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); + INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign); INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit); INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs); @@ -666,4 +689,90 @@ template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { } + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { +#ifndef CPU_ONLY + // NOLINT_NEXT_LINE(caffe/alt_fn) + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); +#else + NO_GPU; +#endif + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, 
a, y); +} + +template <> +void caffe_gpu_add(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + // add_kernel<<>>( + // N, a, b, y); +} + +template <> +void caffe_gpu_add(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + // add_kernel<<>>( + // N, a, b, y); +} + +template <> +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + } // namespace caffe From 8e0713542041d908a1dfda85e2aa95b07532f162 Mon Sep 17 00:00:00 2001 From: Junli Date: Sun, 12 Jul 2015 15:05:21 +0800 Subject: [PATCH 005/124] Minor update ot syncedmem.hpp --- include/caffe/syncedmem.hpp | 2 +- src/caffe/util/math_functions.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 61336d7e..2cb316fb 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -55,7 +55,7 @@ class SyncedMemory { const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); - //const void* gpu_cache_data(); + const void* gpu_cache_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; diff --git a/src/caffe/util/math_functions.cpp 
b/src/caffe/util/math_functions.cpp index 364fbe11..17c2b414 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -775,4 +775,5 @@ template void caffe_set(const int N, const int alpha, int* Y); template void caffe_set(const int N, const float alpha, float* Y); template void caffe_set(const int N, const double alpha, double* Y); + } // namespace caffe From 622a9bced3a5418864c5348e7c8cc80a24746519 Mon Sep 17 00:00:00 2001 From: Yibing Date: Mon, 13 Jul 2015 12:18:39 +0800 Subject: [PATCH 006/124] This patch debugged data layer, added ocl/util, etc. made run cpu alexnet --- examples/imagenet/train_alexnet.sh | 4 ++ examples/imagenet/train_alexnet_cpu.sh | 4 ++ examples/imagenet/train_caffenet_cpu.sh | 4 ++ include/caffe/util/ocl_util.hpp | 16 ++++++ models/bvlc_alexnet/solver.prototxt | 4 +- models/bvlc_alexnet/train_val.prototxt | 8 +-- src/caffe/device.cpp | 2 +- src/caffe/layers/base_data_layer.cpp | 44 +++++++++++++--- src/caffe/layers/data_layer.cpp | 4 +- src/caffe/solver.cpp | 9 ++++ src/caffe/syncedmem.cpp | 5 +- src/caffe/util/ocl_util.cpp | 68 +++++++++++++++++++++++++ 12 files changed, 154 insertions(+), 18 deletions(-) create mode 100755 examples/imagenet/train_alexnet.sh create mode 100755 examples/imagenet/train_alexnet_cpu.sh create mode 100755 examples/imagenet/train_caffenet_cpu.sh create mode 100644 include/caffe/util/ocl_util.hpp create mode 100644 src/caffe/util/ocl_util.cpp diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh new file mode 100755 index 00000000..98c05c59 --- /dev/null +++ b/examples/imagenet/train_alexnet.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver.prototxt diff --git a/examples/imagenet/train_alexnet_cpu.sh b/examples/imagenet/train_alexnet_cpu.sh new file mode 100755 index 00000000..a86f75fe --- /dev/null +++ b/examples/imagenet/train_alexnet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + 
+./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_cpu.prototxt diff --git a/examples/imagenet/train_caffenet_cpu.sh b/examples/imagenet/train_caffenet_cpu.sh new file mode 100755 index 00000000..4bcebf36 --- /dev/null +++ b/examples/imagenet/train_caffenet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_reference_caffenet/solver_cpu.prototxt diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp new file mode 100644 index 00000000..55695070 --- /dev/null +++ b/include/caffe/util/ocl_util.hpp @@ -0,0 +1,16 @@ +// Copyright 2014 AMD DNN contributors. + +#ifndef _CAFFE_UTIL_OCL_UTIL_HPP_ +#define _CAFFE_UTIL_OCL_UTIL_HPP_ + +namespace caffe { + +template +void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count); + +void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count); + +void eventCallback(cl_event event, cl_int event_status, void * user_data); +} // namespace caffe + +#endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/models/bvlc_alexnet/solver.prototxt b/models/bvlc_alexnet/solver.prototxt index 129265e6..6f23e9d1 100644 --- a/models/bvlc_alexnet/solver.prototxt +++ b/models/bvlc_alexnet/solver.prototxt @@ -1,11 +1,11 @@ net: "models/bvlc_alexnet/train_val.prototxt" -test_iter: 1000 +test_iter: 1 test_interval: 1000 base_lr: 0.01 lr_policy: "step" gamma: 0.1 stepsize: 100000 -display: 20 +display: 1 max_iter: 450000 momentum: 0.9 weight_decay: 0.0005 diff --git a/models/bvlc_alexnet/train_val.prototxt b/models/bvlc_alexnet/train_val.prototxt index 588b4ea7..1f9654be 100644 --- a/models/bvlc_alexnet/train_val.prototxt +++ b/models/bvlc_alexnet/train_val.prototxt @@ -10,10 +10,10 @@ layer { transform_param { mirror: true crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_train_lmdb" + 
source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" batch_size: 256 backend: LMDB } @@ -29,10 +29,10 @@ layer { transform_param { mirror: false crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_val_lmdb" + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" batch_size: 50 backend: LMDB } diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 7c564589..bce26316 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -127,7 +127,7 @@ cl_int Device::Init(){ } //Read our own kernel file - const char *pFileName = "../../src/caffe/OCL_kernel.cl"; + const char *pFileName = "./src/caffe/OCL_kernel.cl"; const char *pSource; std::string strSource = ""; ConvertToString(pFileName, strSource); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 917059b8..7169d3fd 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -74,35 +74,63 @@ void BasePrefetchingDataLayer::Forward_cpu( caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), top[1]->mutable_cpu_data()); } + + //sample <=20 data from top_data and display + const Dtype *top_cpu_data = (top)[0]->cpu_data(); + size_t top_cpu_data_count = (top)[0]->count(); + size_t sample_interval = top_cpu_data_count/20; + if(sample_interval == 0){ + sample_interval=1; + } + for(int i=0; i -Dtype DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { +void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { // First, join the thread JoinPrefetchThread(); // Copy the data from prefetch thread to data_layer //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); - OCL_CHECK( 
clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_->count(), prefetch_data_->cpu_data(), 0, NULL, NULL) ); - if (output_labels_) { - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_->count(), prefetch_label_->cpu_data(), 0, NULL, NULL) ); + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); + if (this->output_labels_) { + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); } clFinish(amdDevice.CommandQueue); #ifdef Track_data_transfer #endif + +//sample <=20 data from top_data and display + const Dtype *top_cpu_data = (top)[0]->cpu_data(); + size_t top_cpu_data_count = (top)[0]->count(); + size_t sample_interval = top_cpu_data_count/20; + if(sample_interval == 0){ + sample_interval=1; + } + for(int i=0; i +/*template void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ -} +}*/ diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 26eae788..8ac9b8ee 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -48,7 +48,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, top_shape[0] = this->layer_param_.data_param().batch_size(); this->prefetch_data_.Reshape(top_shape); top[0]->ReshapeLike(this->prefetch_data_); - prefetch_data_->set_data_layer(); + this->prefetch_data_.set_data_layer(); LOG(INFO) << "output data size: " << top[0]->num() << 
"," << top[0]->channels() << "," << top[0]->height() << "," @@ -58,7 +58,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, vector label_shape(1, this->layer_param_.data_param().batch_size()); top[1]->Reshape(label_shape); this->prefetch_label_.Reshape(label_shape); - prefetch_label_->set_data_layer(); + this->prefetch_label_.set_data_layer(); } } diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index aabe0ede..bbac8fb5 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -33,6 +33,14 @@ void Solver::Init(const SolverParameter& param) { << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; + +//#ifndef CPU_ONLY + //AMD device related initialization + amdDevice.Init(); +//#else +// NO_GPU; +//#endif + if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } @@ -42,6 +50,7 @@ void Solver::Init(const SolverParameter& param) { LOG(INFO) << "Solver scaffolding done."; iter_ = 0; current_step_ = 0; + } template diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index ce11aa03..e98e6847 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -3,6 +3,7 @@ #include "caffe/common.hpp" #include "caffe/syncedmem.hpp" #include "caffe/util/math_functions.hpp" +#include "caffe/util/ocl_util.hpp" #define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices @@ -161,6 +162,8 @@ void* SyncedMemory::mutable_gpu_data() { #endif } - +const void *SyncedMemory::gpu_cache_data() +{ +} } // namespace caffe diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp new file mode 100644 index 00000000..8feead82 --- /dev/null +++ b/src/caffe/util/ocl_util.cpp @@ -0,0 +1,68 @@ +// Copyright 2014 AMD DNN contributors. 
+ +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +namespace caffe { + + +template +void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){ + cl_int err=0; + //cl_kernel Kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", &err); + //if(NULL==Kernel){ + // fprintf(stderr, "Failed to create kernel %d\n", err); + //} + + err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); + err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value); + err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); + OCL_CHECK(err); + + size_t Global_Work_Size[1] = {count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +// Explicit instantiation +template void ocl_memset(cl_kernel Kernel, float* buffer, const float value, const int count); +template void ocl_memset(cl_kernel Kernel, double* buffer, const double value, const int count); + + +void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){ + cl_int err=0; + // cl_kernel Kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); + // if(NULL==Kernel){ + // fprintf(stderr, "Failed to create kernel %d\n", err); + // } + + err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); + err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value); + err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); + OCL_CHECK(err); + + size_t Global_Work_Size[] = {count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +void eventCallback(cl_event event, cl_int event_status, void* user_data){ + printf("The calling\n"); + int err = 0; + cl_ulong ev_start_time = (cl_ulong)0; + cl_ulong ev_end_time = (cl_ulong)0; + double run_time; + err = 
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL); + err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL); + run_time = (double)(ev_end_time - ev_start_time); + printf("The kernel's running time is %f s\n", run_time * 1.0e-9); +} + + +} // namespace caffe From 1a45bf189b4555a3ab2246e5bd07dd7f1445a018 Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 16 Jul 2015 02:53:25 +0800 Subject: [PATCH 007/124] Conv layer FP and BP logic ported. Baseline scheme --- examples/imagenet/train_alexnet.sh | 2 +- .../imagenet/train_alexnet_without_dropout.sh | 4 + .../train_alexnet_without_dropout_cpu.sh | 4 + include/caffe/common.hpp | 17 + include/caffe/util/im2col.hpp | 35 + include/caffe/util/ocl_wrapper.hpp | 67 ++ include/caffe/vision_layers.hpp | 29 +- src/caffe/OCL_kernel.cl | 11 + src/caffe/common.cpp | 15 +- src/caffe/data_transformer.cpp | 2 + src/caffe/layers/base_conv_layer.cpp | 107 ++- src/caffe/layers/base_data_layer.cpp | 32 +- src/caffe/layers/conv_layer.cpp | 67 +- src/caffe/layers/dropout_layer.cpp | 2 + src/caffe/layers/inner_product_layer.cpp | 2 + src/caffe/layers/lrn_layer.cpp | 4 + src/caffe/layers/pooling_layer.cpp | 2 + src/caffe/layers/relu_layer.cpp | 2 + src/caffe/layers/softmax_layer.cpp | 2 + src/caffe/layers/softmax_loss_layer.cpp | 2 + src/caffe/layers/split_layer.cpp | 2 + src/caffe/net.cpp | 852 ------------------ src/caffe/solver.cpp | 29 +- src/caffe/util/benchmark.cpp | 8 +- src/caffe/util/im2col.cpp | 246 ++++- src/caffe/util/math_functions.cpp | 18 +- src/caffe/util/ocl_wrapper.cpp | 447 +++++++++ 27 files changed, 1102 insertions(+), 908 deletions(-) create mode 100755 examples/imagenet/train_alexnet_without_dropout.sh create mode 100755 examples/imagenet/train_alexnet_without_dropout_cpu.sh create mode 100644 include/caffe/util/ocl_wrapper.hpp delete mode 100644 src/caffe/net.cpp create mode 100644 src/caffe/util/ocl_wrapper.cpp diff --git 
a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh index 98c05c59..e62279e2 100755 --- a/examples/imagenet/train_alexnet.sh +++ b/examples/imagenet/train_alexnet.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh -./build/tools/caffe train \ +GLOG_logtostderr=1 ./build/tools/caffe train \ --solver=models/bvlc_alexnet/solver.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh new file mode 100755 index 00000000..5f3d3326 --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=1 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh new file mode 100755 index 00000000..15625f8a --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=1 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index b1528474..e0703056 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -21,6 +21,7 @@ #include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" +#include "caffe/util/ocl_wrapper.hpp" // gflags 2.1 issue: namespace google was changed to gflags without warning. 
// Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version @@ -88,6 +89,22 @@ private:\ } \ } while(0) +//sample #num data from Blob_ +#define CHECK_BLOB_DATA(Blob_, num, marker) \ +do{ \ + const Dtype *top_cpu_data = Blob_->cpu_data(); \ + size_t top_cpu_data_count = Blob_->count(); \ + size_t sample_interval = top_cpu_data_count/num; \ + if(sample_interval == 0){ \ + sample_interval=1; \ + } \ + printf("%s: ", marker); \ + for(int i=0; i void im2col_gpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -27,6 +28,40 @@ void col2im_gpu(const Dtype* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); + +template +void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); + +template +void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, const int optnum); + +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im, const int img_offset); + +template +void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, const 
int optnum); + +template +void col2im_gpu_ocl(cl_mem data_col, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, cl_kernel Kernel); + +template +void im2col_gpu_ocl(cl_mem data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, cl_kernel Kernel); } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp new file mode 100644 index 00000000..df9e855e --- /dev/null +++ b/include/caffe/util/ocl_wrapper.hpp @@ -0,0 +1,67 @@ +// Copyright 2014 AMD DNN contributors. + +#ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_ +#define _CAFFE_UTIL_OCL_WRAPPER_HPP_ + +namespace caffe { + +template +void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); + +template +void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const Dtype* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const 
int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); + +template +void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data); + +template +void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff); + +template +void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); + +template +void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data); + +template +void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff); + +template +void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); + +template +void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ); +} // namespace caffe + +#endif // CAFFE_UTIL_OCL_UTIL_HPP_ 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index a6bd86a9..21c72bba 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -25,6 +25,7 @@ class BaseConvolutionLayer : public Layer { public: explicit BaseConvolutionLayer(const LayerParameter& param) : Layer(param) {} + virtual ~BaseConvolutionLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, @@ -46,6 +47,8 @@ class BaseConvolutionLayer : public Layer { void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* weights); void backward_cpu_bias(Dtype* bias, const Dtype* input); +//opencl related setup + void ocl_setup(); #ifndef CPU_ONLY void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, @@ -88,12 +91,16 @@ class BaseConvolutionLayer : public Layer { } #ifndef CPU_ONLY inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); +// im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, +// kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + im2col_gpu(im2col_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + // col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + // kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + col2im_gpu(col2im_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, pad_h_, stride_h_, data, bottom_offset_); } #endif @@ -109,6 +116,20 @@ class BaseConvolutionLayer : public Layer { Blob 
col_buffer_; Blob bias_multiplier_; + +//opencl related data structures +protected: + cl_kernel im2col_kernel, col2im_kernel; + cl_kernel oclmem_kernel; + cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat; + cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; + cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel; +public: + static cl_mem subTopMem, transMem; + static size_t subtop_mem_size, trans_mem_size; + +public: + size_t top_offset_, bottom_offset_; }; /** diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 980dc37c..d132efe8 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -743,6 +743,17 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size) } } +template +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ + int gdx = get_global_id(0); + if(gdx < N){ + Y[gdx] =((0.0 __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){ int index=get_global_id(0); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 052281d4..407668c9 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -47,9 +47,11 @@ void GlobalInit(int* pargc, char*** pargv) { #ifdef CPU_ONLY // CPU-only Caffe. Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU) { } + : random_generator_(), mode_(Caffe::CPU) { + } -Caffe::~Caffe() { } +Caffe::~Caffe() { +} void Caffe::set_random_seed(const unsigned int seed) { // RNG seed @@ -106,6 +108,14 @@ Caffe::Caffe() LOG(ERROR) << "Cannot create Curand generator. 
Curand won't be available."; } */ + cl_int err = clblasSetup(); + if(err != CL_SUCCESS){ + LOG(ERROR) << "clBLAS setup failed "<::DataTransformer(const TransformationParameter& param, ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); data_mean_.FromProto(blob_proto); } + printf("before if\n"); // check if we want to use mean_value if (param_.mean_value_size() > 0) { CHECK(param_.has_mean_file() == false) << @@ -32,6 +33,7 @@ DataTransformer::DataTransformer(const TransformationParameter& param, mean_values_.push_back(param_.mean_value(c)); } } + printf("reaches here\n"); } template diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ccb3adc7..38d8952d 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -8,6 +8,63 @@ namespace caffe { +#ifdef use_packing_scheme +template size_t BaseConvolutionLayer::subtop_mem_size = sizeof(Dtype); +template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); +template cl_mem BaseConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); +template cl_mem BaseConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); +#endif + +template +void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) +{ + if(subtop_size > BaseConvolutionLayer::subtop_mem_size){ + ConvolutionLayer::subtop_mem_size = subtop_size; + clReleaseMemObject(ConvolutionLayer::subTopMem); + ConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); + } + if(trans_size > ConvolutionLayer::trans_mem_size){ + ConvolutionLayer::trans_mem_size = trans_size; + clReleaseMemObject(ConvolutionLayer::transMem); + ConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, 
NULL); + } +} + +template +void BaseConvolutionLayer::ocl_setup() { + im2col_kernel = clCreateKernel(amdDevice.Program,"im2colfloat", NULL); + col2im_kernel = clCreateKernel(amdDevice.Program,"col2imfloat", NULL); + oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); + im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); + col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL); + opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL); + ocl_Kernel_im2colfloat = clCreateKernel(amdDevice.Program,"im2colfloat_yuan",NULL); + ocl_Kernel_col2imfloat = clCreateKernel(amdDevice.Program,"col2imfloat_yuan",NULL); + ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL); + ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL); + +#ifdef use_packing_scheme + size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); + size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); + Alloc_public_tmp_mem(subtop_size, trans_size); +#endif +} + + +template + BaseConvolutionLayer::~BaseConvolutionLayer(){ + OCL_CHECK( clReleaseKernel(im2col_kernel) ); + OCL_CHECK( clReleaseKernel(col2im_kernel) ); + OCL_CHECK( clReleaseKernel(oclmem_kernel) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_im2colfloat) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_col2imfloat) ); + OCL_CHECK( clReleaseKernel(im2col_opt_kernel) ); + OCL_CHECK( clReleaseKernel(col2im_opt_kernel) ); +} + + template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { @@ -68,6 +125,10 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, conv_out_channels_ = num_output_; conv_in_channels_ = channels_; } + + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); + // Handle the parameters: 
weights and biases. // - blobs_[0] holds the filter weights // - blobs_[1] holds the biases (optional) @@ -234,20 +295,31 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } col_buff = col_buffer_.gpu_data(); } + for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / + /*caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, (Dtype)0., output + output_offset_ * g); - } + */ + //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count()); + caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, + conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, + (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g, + (Dtype)0., output, top_offset_+output_offset_ * g); + } } template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const Dtype* bias) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, + /*caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), - (Dtype)1., output); + (Dtype)1., output);*/ + caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, num_output_, + height_out_*width_out_, 1, (Dtype)1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype)1., output, top_offset_); } template @@ -258,13 +330,18 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, + /* caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, (Dtype)0., col_buff + col_offset_ * g); + */ + caffe_gpu_gemmex(&(amdDevice.CommandQueue), 
CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype)1., weights, weight_offset_ * g, + output, top_offset_+output_offset_ * g, + (Dtype)0., col_buff, col_offset_ * g); } if (!is_1x1_) { - conv_col2im_gpu(col_buff, input); + conv_col2im_gpu(col_buff, input); } } @@ -277,18 +354,26 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, col_buff = col_buffer_.gpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, + /* caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); - } + (Dtype)1., weights + weight_offset_ * g);*/ + caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype)1., output, top_offset_, + (Dtype*)col_buff, col_offset_ * g, (Dtype)1., + (Dtype*)weights, weight_offset_ * g); + } } template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.gpu_data(), 1., bias); + /* caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., + input, bias_multiplier_.gpu_data(), 1., bias);*/ + caffe_gpu_gemvv(CblasNoTrans, num_output_, height_out_*width_out_, + (Dtype)1., input, top_offset_, height_out_*width_out_, + reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, + bias, (size_t)0, 1); } #endif // !CPU_ONLY diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 7169d3fd..b768f05f 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -75,17 +75,7 @@ void BasePrefetchingDataLayer::Forward_cpu( top[1]->mutable_cpu_data()); } - 
//sample <=20 data from top_data and display - const Dtype *top_cpu_data = (top)[0]->cpu_data(); - size_t top_cpu_data_count = (top)[0]->count(); - size_t sample_interval = top_cpu_data_count/20; - if(sample_interval == 0){ - sample_interval=1; - } - for(int i=0; i::Forward_gpu(const vector*>& bo JoinPrefetchThread(); // Copy the data from prefetch thread to data_layer //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); + top[0]->ReshapeLike(prefetch_data_); + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); if (this->output_labels_) { + // Reshape to loaded labels. 
+ top[1]->ReshapeLike(prefetch_label_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); } clFinish(amdDevice.CommandQueue); + #ifdef Track_data_transfer #endif - -//sample <=20 data from top_data and display - const Dtype *top_cpu_data = (top)[0]->cpu_data(); - size_t top_cpu_data_count = (top)[0]->count(); - size_t sample_interval = top_cpu_data_count/20; - if(sample_interval == 0){ - sample_interval=1; - } - for(int i=0; i::Forward_cpu(const vector*>& bottom, } } } + + CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -65,16 +67,77 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } + + CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + //two intermediate variables to pass offset + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->forward_gpu_gemm(bottom_data, weight, + top_data); + + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } + + //Forward_cpu(bottom, top); + CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& 
propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(bottom_data, + top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_gpu_gemm(top_diff, weight, + bottom_diff); + } + } + } + } + + CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 7f1ac8f6..4239443d 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -70,11 +70,13 @@ void DropoutLayer::Backward_cpu(const vector*>& top, template void DropoutLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void DropoutLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 4d25215a..8edd6148 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -122,11 +122,13 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, template void InnerProductLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void InnerProductLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 47fa5ed5..e49e2963 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -250,21 +250,25 @@ void LRNLayer::WithinChannelBackward( template void LRNLayer::CrossChannelForward_gpu(const vector*>& bottom, const vector*>& top){ + CrossChannelForward_cpu(bottom, top); } template void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + CrossChannelBackward_gpu(top, propagate_down, bottom); } template void LRNLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, 
top); } template void LRNLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index d5207889..97a5c150 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -312,11 +312,13 @@ void PoolingLayer::Backward_cpu(const vector*>& top, template void PoolingLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void PoolingLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index e05080bf..ce85b1cc 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -39,11 +39,13 @@ void ReLULayer::Backward_cpu(const vector*>& top, template void ReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void ReLULayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 488e836a..973db6e7 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -89,11 +89,13 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void SoftmaxLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 6380f264..072f9f71 100644 --- 
a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -123,11 +123,13 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, template void SoftmaxWithLossLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 932b240b..1894d0f1 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -52,11 +52,13 @@ void SplitLayer::Backward_cpu(const vector*>& top, template void SplitLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Forward_cpu(bottom, top); } template void SplitLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + Backward_cpu(top, propagate_down, bottom); } #ifdef CPU_ONLY diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp deleted file mode 100644 index a18ee638..00000000 --- a/src/caffe/net.cpp +++ /dev/null @@ -1,852 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/net.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/insert_splits.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/upgrade_proto.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -Net::Net(const NetParameter& param) { - Init(param); -} - -template -Net::Net(const string& param_file, Phase phase) { - NetParameter param; - ReadNetParamsFromTextFileOrDie(param_file, ¶m); - param.mutable_state()->set_phase(phase); - Init(param); -} - -template -void Net::Init(const NetParameter& in_param) { - // Set phase from the state. 
- phase_ = in_param.state().phase(); - // Filter layers based on their include/exclude rules and - // the current NetState. - NetParameter filtered_param; - FilterNet(in_param, &filtered_param); - LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); - // Create a copy of filtered_param with splits added where necessary. - NetParameter param; - InsertSplits(filtered_param, ¶m); - // Basically, build all the layers and set up their connections. - name_ = param.name(); - map blob_name_to_idx; - set available_blobs; - CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) - << "Must specify either input_shape OR deprecated input_dim, not both."; - if (param.input_dim_size() > 0) { - // Deprecated 4D dimensions. - CHECK_EQ(param.input_size() * 4, param.input_dim_size()) - << "Incorrect input blob dimension specifications."; - } else { - CHECK_EQ(param.input_size(), param.input_shape_size()) - << "Exactly one input_shape must be specified per input."; - } - memory_used_ = 0; - // set the input blobs - for (int input_id = 0; input_id < param.input_size(); ++input_id) { - const int layer_id = -1; // inputs have fake layer ID -1 - AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - // For each layer, set up its input and output - bottom_vecs_.resize(param.layer_size()); - top_vecs_.resize(param.layer_size()); - bottom_id_vecs_.resize(param.layer_size()); - param_id_vecs_.resize(param.layer_size()); - top_id_vecs_.resize(param.layer_size()); - bottom_need_backward_.resize(param.layer_size()); - for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { - // Inherit phase from net if unset. - if (!param.layer(layer_id).has_phase()) { - param.mutable_layer(layer_id)->set_phase(phase_); - } - // Setup layer. 
- const LayerParameter& layer_param = param.layer(layer_id); - if (layer_param.propagate_down_size() > 0) { - CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) - << "propagate_down param must be specified " - << "either 0 or bottom_size times "; - } - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); - layer_names_.push_back(layer_param.name()); - LOG(INFO) << "Creating Layer " << layer_param.name(); - bool need_backward = false; - - // Figure out this layer's input and output - for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { - const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); - // If a blob needs backward, this layer should provide it. - need_backward |= blob_need_backward_[blob_id]; - } - int num_top = layer_param.top_size(); - for (int top_id = 0; top_id < num_top; ++top_id) { - AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); - } - // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter - // specified fewer than the required number (as specified by - // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. - Layer* layer = layers_[layer_id].get(); - if (layer->AutoTopBlobs()) { - const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); - for (; num_top < needed_num_top; ++num_top) { - // Add "anonymous" top blobs -- do not modify available_blobs or - // blob_name_to_idx as we don't want these blobs to be usable as input - // to other layers. - AppendTop(param, layer_id, num_top, NULL, NULL); - } - } - // After this layer is connected, set it up. 
- LOG(INFO) << "Setting up " << layer_names_[layer_id]; - layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { - blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); - } - blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); - if (layer->loss(top_id)) { - LOG(INFO) << " with loss weight " << layer->loss(top_id); - } - memory_used_ += top_vecs_[layer_id][top_id]->count(); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - const int param_size = layer_param.param_size(); - const int num_param_blobs = layers_[layer_id]->blobs().size(); - CHECK_LE(param_size, num_param_blobs) - << "Too many params specified for layer " << layer_param.name(); - ParamSpec default_param_spec; - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - const ParamSpec* param_spec = (param_id < param_size) ? - &layer_param.param(param_id) : &default_param_spec; - const bool param_need_backward = param_spec->lr_mult() > 0; - need_backward |= param_need_backward; - layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); - } - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - AppendParam(param, layer_id, param_id); - } - // Finally, set the backward flag - layer_need_backward_.push_back(need_backward); - if (need_backward) { - for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { - blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; - } - } - } - // Go through the net backwards to determine which blobs contribute to the - // loss. We can skip backward computation for blobs that don't contribute - // to the loss. 
- // Also checks if all bottom blobs don't need backward computation (possible - // because the skip_propagate_down param) and so we can skip bacward - // computation for the entire layer - set blobs_under_loss; - set blobs_skip_backp; - for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { - bool layer_contributes_loss = false; - bool layer_skip_propagate_down = true; - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { - layer_contributes_loss = true; - } - if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { - layer_skip_propagate_down = false; - } - if (layer_contributes_loss && !layer_skip_propagate_down) - break; - } - // If this layer can skip backward computation, also all his bottom blobs - // don't need backpropagation - if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { - layer_need_backward_[layer_id] = false; - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - bottom_need_backward_[layer_id][bottom_id] = false; - } - } - if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } - if (layer_need_backward_[layer_id]) { - LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; - } else { - LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; - } - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - if (layer_contributes_loss) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - blobs_under_loss.insert(blob_name); - } else { - bottom_need_backward_[layer_id][bottom_id] = false; - } - if (!bottom_need_backward_[layer_id][bottom_id]) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - blobs_skip_backp.insert(blob_name); - } - } - 
} - // Handle force_backward if needed. - if (param.force_backward()) { - for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { - layer_need_backward_[layer_id] = true; - for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { - bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - layers_[layer_id]->set_param_propagate_down(param_id, true); - } - } - } - // In the end, all remaining blobs are considered output blobs. - for (set::iterator it = available_blobs.begin(); - it != available_blobs.end(); ++it) { - LOG(INFO) << "This network produces output " << *it; - net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); - net_output_blob_indices_.push_back(blob_name_to_idx[*it]); - } - for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { - blob_names_index_[blob_names_[blob_id]] = blob_id; - } - for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { - layer_names_index_[layer_names_[layer_id]] = layer_id; - } - GetLearningRateAndWeightDecay(); - debug_info_ = param.debug_info(); - LOG(INFO) << "Network initialization done."; - LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); -} - -template -void Net::FilterNet(const NetParameter& param, - NetParameter* param_filtered) { - NetState net_state(param.state()); - param_filtered->CopyFrom(param); - param_filtered->clear_layer(); - for (int i = 0; i < param.layer_size(); ++i) { - const LayerParameter& layer_param = param.layer(i); - const string& layer_name = layer_param.name(); - CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) - << 
"Specify either include rules or exclude rules; not both."; - // If no include rules are specified, the layer is included by default and - // only excluded if it meets one of the exclude rules. - bool layer_included = (layer_param.include_size() == 0); - for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { - layer_included = false; - } - } - for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { - layer_included = true; - } - } - if (layer_included) { - param_filtered->add_layer()->CopyFrom(layer_param); - } - } -} - -template -bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { - // Check whether the rule is broken due to phase. - if (rule.has_phase()) { - if (rule.phase() != state.phase()) { - LOG(INFO) << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to min level. - if (rule.has_min_level()) { - if (state.level() < rule.min_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to max level. - if (rule.has_max_level()) { - if (state.level() > rule.max_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to stage. The NetState must - // contain ALL of the rule's stages to meet it. - for (int i = 0; i < rule.stage_size(); ++i) { - // Check that the NetState contains the rule's ith stage. 
- bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.stage(i) == state.stage(j)) { has_stage = true; } - } - if (!has_stage) { - LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to not_stage. The NetState must - // contain NONE of the rule's not_stages to meet it. - for (int i = 0; i < rule.not_stage_size(); ++i) { - // Check that the NetState contains the rule's ith not_stage. - bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } - } - if (has_stage) { - LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - return true; -} - -// Helper for Net::Init: add a new input or top blob to the net. (Inputs have -// layer_id == -1, tops have layer_id >= 0.) -template -void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { - shared_ptr layer_param((layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : NULL); - const string& blob_name = layer_param ? - (layer_param->top_size() > top_id ? 
- layer_param->top(top_id) : "(automatic)") : param.input(top_id); - // Check if we are doing in-place computation - if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { - // In-place computation - LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; - top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); - top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); - } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { - // If we are not doing in-place computation but have duplicated blobs, - // raise an error. - LOG(FATAL) << "Duplicate blobs produced by multiple sources."; - } else { - // Normal output. - if (layer_param) { - LOG(INFO) << layer_param->name() << " -> " << blob_name; - } else { - LOG(INFO) << "Input " << top_id << " -> " << blob_name; - } - shared_ptr > blob_pointer(new Blob()); - const int blob_id = blobs_.size(); - blobs_.push_back(blob_pointer); - blob_names_.push_back(blob_name); - blob_need_backward_.push_back(false); - if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } - if (layer_id == -1) { - // Set the (explicitly specified) dimensions of the input blob. - if (param.input_dim_size() > 0) { - blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); - } else { - blob_pointer->Reshape(param.input_shape(top_id)); - } - net_input_blob_indices_.push_back(blob_id); - net_input_blobs_.push_back(blob_pointer.get()); - } else { - top_id_vecs_[layer_id].push_back(blob_id); - top_vecs_[layer_id].push_back(blob_pointer.get()); - } - } - if (available_blobs) { available_blobs->insert(blob_name); } -} - -// Helper for Net::Init: add a new bottom blob to the net. 
-template -int Net::AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx) { - const LayerParameter& layer_param = param.layer(layer_id); - const string& blob_name = layer_param.bottom(bottom_id); - if (available_blobs->find(blob_name) == available_blobs->end()) { - LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; - } - const int blob_id = (*blob_name_to_idx)[blob_name]; - LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; - bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); - bottom_id_vecs_[layer_id].push_back(blob_id); - available_blobs->erase(blob_name); - bool propagate_down = true; - // Check if the backpropagation on bottom_id should be skipped - if (layer_param.propagate_down_size() > 0) - propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; - bottom_need_backward_[layer_id].push_back(need_backward); - return blob_id; -} - -template -void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { - const LayerParameter& layer_param = layers_[layer_id]->layer_param(); - const int param_size = layer_param.param_size(); - string param_name = - (param_size > param_id) ? 
layer_param.param(param_id).name() : ""; - if (param_name.size()) { - param_display_names_.push_back(param_name); - } else { - ostringstream param_display_name; - param_display_name << param_id; - param_display_names_.push_back(param_display_name.str()); - } - const int net_param_id = params_.size(); - params_.push_back(layers_[layer_id]->blobs()[param_id]); - param_id_vecs_[layer_id].push_back(net_param_id); - param_layer_indices_.push_back(make_pair(layer_id, param_id)); - if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { - // This layer "owns" this parameter blob -- it is either anonymous - // (i.e., not given a param_name) or explicitly given a name that we - // haven't already seen. - param_owners_.push_back(-1); - if (param_name.size()) { - param_names_index_[param_name] = net_param_id; - } - } else { - // Named param blob with name we've seen before: share params - const int owner_net_param_id = param_names_index_[param_name]; - param_owners_.push_back(owner_net_param_id); - const pair& owner_index = - param_layer_indices_[owner_net_param_id]; - const int owner_layer_id = owner_index.first; - const int owner_param_id = owner_index.second; - LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); - Blob* owner_blob = - layers_[owner_layer_id]->blobs()[owner_param_id].get(); - const int param_size = layer_param.param_size(); - if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { - // Permissive dimension checking -- only check counts are the same. - CHECK_EQ(this_blob->count(), owner_blob->count()) - << "Shared parameter blobs must have the same count."; - } else { - // Strict dimension checking -- all dims must be the same. 
- CHECK(this_blob->shape() == owner_blob->shape()); - } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); - } -} - -template -void Net::GetLearningRateAndWeightDecay() { - LOG(INFO) << "Collecting Learning Rate and Weight Decay."; - ParamSpec default_param_spec; - for (int i = 0; i < layers_.size(); ++i) { - vector > >& layer_blobs = layers_[i]->blobs(); - for (int j = 0; j < layer_blobs.size(); ++j) { - const ParamSpec* param_spec = - (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; - params_lr_.push_back(param_spec->lr_mult()); - params_weight_decay_.push_back(param_spec->decay_mult()); - } - } -} - -template -Dtype Net::ForwardFromTo(int start, int end) { - CHECK_GE(start, 0); - CHECK_LT(end, layers_.size()); - Dtype loss = 0; - if (debug_info_) { - for (int i = 0; i < net_input_blobs_.size(); ++i) { - InputDebugInfo(i); - } - } - for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; - Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); - loss += layer_loss; - if (debug_info_) { ForwardDebugInfo(i); } - } - return loss; -} - -template -Dtype Net::ForwardFrom(int start) { - return ForwardFromTo(start, layers_.size() - 1); -} - -template -Dtype Net::ForwardTo(int end) { - return ForwardFromTo(0, end); -} - -template -const vector*>& Net::ForwardPrefilled(Dtype* loss) { - if (loss != NULL) { - *loss = ForwardFromTo(0, layers_.size() - 1); - } else { - ForwardFromTo(0, layers_.size() - 1); - } - return net_output_blobs_; -} - -template -const vector*>& Net::Forward( - const vector*> & bottom, Dtype* loss) { - // Copy bottom to internal bottom - for (int i = 0; i < bottom.size(); ++i) { - net_input_blobs_[i]->CopyFrom(*bottom[i]); - } - return ForwardPrefilled(loss); -} - -template -string Net::Forward(const string& input_blob_protos, Dtype* loss) { - BlobProtoVector blob_proto_vec; - if 
(net_input_blobs_.size()) { - blob_proto_vec.ParseFromString(input_blob_protos); - CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) - << "Incorrect input size."; - for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { - net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); - } - } - ForwardPrefilled(loss); - blob_proto_vec.Clear(); - for (int i = 0; i < net_output_blobs_.size(); ++i) { - net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); - } - string output; - blob_proto_vec.SerializeToString(&output); - return output; -} - -template -void Net::BackwardFromTo(int start, int end) { - CHECK_GE(end, 0); - CHECK_LT(start, layers_.size()); - for (int i = start; i >= end; --i) { - if (layer_need_backward_[i]) { - layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { BackwardDebugInfo(i); } - } - } -} - -template -void Net::InputDebugInfo(const int input_id) { - const Blob& blob = *net_input_blobs_[input_id]; - const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; -} - -template -void Net::ForwardDebugInfo(const int layer_id) { - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const Blob& blob = *top_vecs_[layer_id][top_id]; - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const int net_param_id = param_id_vecs_[layer_id][param_id]; - const string& blob_name = param_display_names_[net_param_id]; - const Dtype data_abs_val_mean 
= blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; - } -} - -template -void Net::BackwardDebugInfo(const int layer_id) { - const vector*>& bottom_vec = bottom_vecs_[layer_id]; - for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } - const Blob& blob = *bottom_vec[bottom_id]; - const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; - } -} - -template -void Net::UpdateDebugInfo(const int param_id) { - const Blob& blob = *params_[param_id]; - const int param_owner = param_owners_[param_id]; - const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; - const string& param_display_name = param_display_names_[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (param_owner < 0) { - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; - } else { - const string& owner_layer_name = - layer_names_[param_layer_indices_[param_owner].first]; - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << 
param_display_name - << " (owned by layer " << owner_layer_name << ", " - << "param " << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; - } -} - -template -void Net::ShareTrainedLayersWith(const Net* other) { - int num_source_layers = other->layers().size(); - for (int i = 0; i < num_source_layers; ++i) { - Layer* source_layer = other->layers()[i].get(); - const string& source_layer_name = other->layer_names()[i]; - int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - Blob* source_blob = source_layer->blobs()[j].get(); - CHECK(target_blobs[j]->shape() == source_blob->shape()); - target_blobs[j]->ShareData(*source_blob); - } - } -} - -template -void Net::BackwardFrom(int start) { - BackwardFromTo(start, 0); -} - -template -void Net::BackwardTo(int end) { - BackwardFromTo(layers_.size() - 1, end); -} - -template -void Net::Backward() { - BackwardFromTo(layers_.size() - 1, 0); - if (debug_info_) { - Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; - for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - asum_data += params_[i]->asum_data(); - asum_diff += params_[i]->asum_diff(); - sumsq_data += params_[i]->sumsq_data(); - sumsq_diff += params_[i]->sumsq_diff(); - } - const Dtype l2norm_data = std::sqrt(sumsq_data); - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - LOG(ERROR) << " [Backward] All net params (data, diff): " - << "L1 norm = (" 
<< asum_data << ", " << asum_diff << "); " - << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; - } -} - -template -void Net::Reshape() { - for (int i = 0; i < layers_.size(); ++i) { - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); - } -} - -template -void Net::CopyTrainedLayersFrom(const NetParameter& param) { - int num_source_layers = param.layer_size(); - for (int i = 0; i < num_source_layers; ++i) { - const LayerParameter& source_layer = param.layer(i); - const string& source_layer_name = source_layer.name(); - int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - const bool kReshape = false; - target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); - } - } -} - -template -void Net::CopyTrainedLayersFrom(const string trained_filename) { - NetParameter param; - ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); - CopyTrainedLayersFrom(param); -} - -template -void Net::ToProto(NetParameter* param, bool write_diff) const { - param->Clear(); - param->set_name(name_); - // Add bottom and top - for (int i = 0; i < net_input_blob_indices_.size(); ++i) { - param->add_input(blob_names_[net_input_blob_indices_[i]]); - } - DLOG(INFO) << "Serializing " << layers_.size() << " layers"; - for (int i = 0; i < layers_.size(); ++i) { - LayerParameter* layer_param = param->add_layer(); - for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { - layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); - } - for (int j = 0; j 
< top_id_vecs_[i].size(); ++j) { - layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); - } - layers_[i]->ToProto(layer_param, write_diff); - } -} - -template -void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) - for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); -#else - NO_GPU; -#endif - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } - } - // Now, update the owned parameters. 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - params_[i]->Update(); - } -} - -template -bool Net::has_blob(const string& blob_name) const { - return blob_names_index_.find(blob_name) != blob_names_index_.end(); -} - -template -const shared_ptr > Net::blob_by_name( - const string& blob_name) const { - shared_ptr > blob_ptr; - if (has_blob(blob_name)) { - blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; - } else { - blob_ptr.reset((Blob*)(NULL)); - LOG(WARNING) << "Unknown blob name " << blob_name; - } - return blob_ptr; -} - -template -bool Net::has_layer(const string& layer_name) const { - return layer_names_index_.find(layer_name) != layer_names_index_.end(); -} - -template -const shared_ptr > Net::layer_by_name( - const string& layer_name) const { - shared_ptr > layer_ptr; - if (has_layer(layer_name)) { - layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; - } else { - layer_ptr.reset((Layer*)(NULL)); - LOG(WARNING) << "Unknown layer name " << layer_name; - } - return layer_ptr; -} - -INSTANTIATE_CLASS(Net); - -} // namespace caffe diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index bbac8fb5..87f746d8 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -37,6 +37,7 @@ void Solver::Init(const SolverParameter& param) { //#ifndef CPU_ONLY //AMD device related initialization amdDevice.Init(); +// cl_int err = clblasSetup(); //#else // NO_GPU; //#endif @@ -519,6 +520,7 @@ void SGDSolver::Normalize(int param_id) { #ifndef CPU_ONLY caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_gpu_diff()); + CHECK_BLOB_DATA(net_params[param_id], 10, "NORM"); #else NO_GPU; #endif @@ -537,6 +539,15 @@ void SGDSolver::Regularize(int param_id) { Dtype weight_decay = this->param_.weight_decay(); string regularization_type = this->param_.regularization_type(); Dtype local_decay = weight_decay * 
net_params_weight_decay[param_id]; + + Dtype *cpu_diff = net_params[param_id]->mutable_cpu_diff(); + printf("cpu diff before reg\n"); + for(int i=0; i<10; i++) + printf("%f,",cpu_diff[i]); + printf("\n"); + + + switch (Caffe::mode()) { case Caffe::CPU: { if (local_decay) { @@ -589,6 +600,18 @@ void SGDSolver::Regularize(int param_id) { default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } + CHECK_BLOB_DATA(net_params[param_id], 10, "REGU"); + cpu_diff = net_params[param_id]->mutable_cpu_diff(); + printf("cpu diff\n"); + for(int i=0; i<10; i++) + printf("%f,",cpu_diff[i]); + printf("\n"); + + cpu_diff = temp_[param_id]->mutable_cpu_diff(); + printf("tmp\n"); + for(int i=0; i<10; i++) + printf("%f,",cpu_diff[i]); + printf("\n"); } template @@ -613,9 +636,11 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { caffe_gpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->gpu_diff(), momentum, history_[param_id]->mutable_gpu_data()); - caffe_copy(net_params[param_id]->count(), + caffe_gpu_copy(net_params[param_id]->count(), history_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); + +CHECK_BLOB_DATA(net_params[param_id], 10, "COMPUTATE"); #else NO_GPU; #endif @@ -693,7 +718,7 @@ void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->update_[param_id]->mutable_gpu_data()); // copy - caffe_copy(net_params[param_id]->count(), + caffe_gpu_copy(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); #else diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 1d269c35..a8c5a83f 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -15,8 +15,8 @@ Timer::Timer() Timer::~Timer() { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY - CUDA_CHECK(cudaEventDestroy(start_gpu_)); - CUDA_CHECK(cudaEventDestroy(stop_gpu_)); + // CUDA_CHECK(cudaEventDestroy(start_gpu_)); + // 
CUDA_CHECK(cudaEventDestroy(stop_gpu_)); #else NO_GPU; #endif @@ -108,8 +108,8 @@ void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY - CUDA_CHECK(cudaEventCreate(&start_gpu_)); - CUDA_CHECK(cudaEventCreate(&stop_gpu_)); + // CUDA_CHECK(cudaEventCreate(&start_gpu_)); + // CUDA_CHECK(cudaEventCreate(&stop_gpu_)); #else NO_GPU; #endif diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 6545d98c..ac44f425 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -2,6 +2,7 @@ #include #include +#include "caffe/common.hpp" #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" @@ -81,13 +82,14 @@ template void col2im_cpu(const double* data_col, const int channels, const int stride_w, double* data_im); - +/* template void im2col_gpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { + } @@ -100,8 +102,8 @@ template void im2col_gpu(const double* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col); - - +*/ +/* template void col2im_gpu(const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, @@ -118,5 +120,243 @@ template void col2im_gpu(const double* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); +*/ +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset) { + + int height_col = (height + 2 * pad - 
ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + clFinish(amdDevice.CommandQueue); +} + +template void im2col_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset); +template void im2col_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset); + +template +void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * 
pad - ksize) / stride + 1; + int num_kernels = 16 * channels * height_col * width_col; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256 - 256 % width_col}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + +template void im2col_16_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset); +template void im2col_16_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset); + +template +void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, const int optnum) { + + int height_col = (height + 2 * pad - ksize) 
/ stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = optnum * channels * height_col * width_col; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset); + ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256 - 256 % width_col}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + +template void im2col_opt_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset, const int optnum); +template void im2col_opt_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset, const int optnum); + +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, + const int height, const int width, const int 
ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operatiors) + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + + +template void col2im_gpu(cl_kernel Kernel, const float* data_col, const int col_offset, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, float* data_im, const int img_offset); +template void col2im_gpu(cl_kernel Kernel, const double* data_col, const int col_offset, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, double* data_im, 
const int img_offset); + +template +void im2col_gpu_ocl(cl_mem data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, cl_kernel Kernel) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col); + OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) ); + + //std::cout<<"num_kernels"<(cl_mem data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, cl_kernel Kernel); +template void im2col_gpu_ocl(cl_mem data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, cl_kernel Kernel); + +template +void col2im_gpu_ocl(cl_mem data_col, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, cl_kernel Kernel) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operatiors) + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_im); + + if(ret!=CL_SUCCESS){ + fprintf(stderr,"Failed to Set Args\n"); + } + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {64}; + cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL); + if(CL_SUCCESS!=iStatus){ + fprintf(stderr,"Failed to enqueue kernel\n"); + } +} + + +template void col2im_gpu_ocl(cl_mem data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, float* data_im, cl_kernel Kernel); +template void col2im_gpu_ocl(cl_mem data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, double* data_im, cl_kernel Kernel); } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 17c2b414..cf9b1ca5 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -542,9 +542,9 @@ template <> void caffe_gpu_asum(const int n, const double* x, double* y) { } -DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - - (x[index] < Dtype(0))); -DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = 
signbit(x[index])); +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) + // - (x[index] < Dtype(0))); +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign); INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit); @@ -609,6 +609,18 @@ void mul_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { } +template<> +void caffe_gpu_sign(const int N, const float *X, float *Y){ + cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); + caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); +} + +template<> +void caffe_gpu_sign(const int N, const double *X, double *Y){ + cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); + caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); +} + template <> void caffe_gpu_mul(const int N, const float* a, const float* b, float* y) { diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp new file mode 100644 index 00000000..32a477fc --- /dev/null +++ b/src/caffe/util/ocl_wrapper.cpp @@ -0,0 +1,447 @@ +// Copyright 2014 AMD DNN contributors. 
+ +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +namespace caffe { + +template +void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ + cl_int ret; + ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src); + OCL_CHECK(ret); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst); + OCL_CHECK(ret); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset); + OCL_CHECK(ret); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_); + OCL_CHECK(ret); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_); + OCL_CHECK(ret); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size2[]={M_ * packing_num}; + size_t uiLocal_Work_Size2[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) ); +} + +template void transform_gpu(cl_kernel Kernel, float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num); +template void transform_gpu(cl_kernel Kernel, double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) ); + + size_t Global_Work_Size[1] = {num}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +// Explicit instantiation +template void 
get_max_gpu(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data); +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data); + + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {num}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +// Explicit instantiation +template void exp_gpu(cl_kernel Kernel, const int num, const float* data, float* out); +template void exp_gpu(cl_kernel Kernel, const int num, const double* data, double* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data){ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); + + size_t Global_Work_Size[1] = {num*dim}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +// Explicit instantiation +template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const float* scale, float* data); +template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss){ + + 
OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&d_loss)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&dim)); + OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); + + size_t globalws[1] = {256}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, localws, 0, NULL, NULL) ); + void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + Dtype loss = *(Dtype*)h_loss; + clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, NULL); + + return loss; +} + +// Explicit instantiation +template float softmax_gpu(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss); +template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); + + size_t Global_Work_Size[1] = {num}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +// Explicit instantiation +template void scal_gpu(cl_kernel Kernel, const int num, const float alpha, float* data); +template void scal_gpu(cl_kernel Kernel, const int num, const double alpha, double* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype* label){ + 
OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) ); + + size_t Global_Work_Size[1] = {num}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +// Explicit instantiation +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, float* data, const float* label); +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, double* data, const double* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, 
Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data); +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); + ret |= clSetKernelArg(Kernel, 10,sizeof(cl_int), (void*)&pad_); + ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {count * 1}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, 
NULL)); +} + +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* top_data); +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_,const int stride_,const int pad_, double* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_size_); + ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_); + ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {count}; + size_t uiLocal_Work_Size[] = {256}; + 
OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff); +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff ); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&pad_); + ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t 
uiGlobal_Work_Size[]={count}; + size_t uiLocal_Work_Size[]={256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL)); +} + +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff); +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); + +template +void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void Relu_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data); +template void Relu_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data); + +template +void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); 
+ OCL_CHECK(ret); + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void Relu_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff); +template void Relu_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff); + +template +void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {N}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign(cl_kernel Kernel,const int N, const float* X, float* Y ); +template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double* X, double* Y ); + +template +void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); +template void caffe_gpu_div (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); + 
+template +void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const Dtype alpha, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const float alpha, float* y); +template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const double alpha, double* y); + +template +void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); +template void caffe_gpu_mul (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); + +template +void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + 
OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y); +template void caffe_gpu_powx (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y); + +template +void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) +{ + cl_int ret; + ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); + ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); + ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); + ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); + ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void Dropout_fp_gpu(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); +template void Dropout_fp_gpu(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); + +template +void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) +{ + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); + ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); + ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); + ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t 
Global_Work_Size[] = {count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Dropout_bp_gpu(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); +template void Dropout_bp_gpu(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); + +typedef unsigned int uint32_t; +struct array4x32 { uint32_t v[4]; }; +template +void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){ + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float), (void*)&threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold); + + +template +void opttrans(cl_kernel Kernel, const Dtype* data_im, 
const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { + + int num_kernels = channels * height * width * optnum; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operatiors) + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + +template void opttrans(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels, + const int height, const int width, float* data_opt, const int opt_offset, const int optnum); +template void opttrans(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels, + const int height, const int width, double* data_opt, const int opt_offset, const int optnum); + + +} // namespace caffe + From c0ff752749500f0f3b992ec1d0bc6f3fb15c7fdf Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 16 Jul 2015 16:00:22 +0800 Subject: [PATCH 008/124] Debugging layer. 
Not much change for layers --- include/caffe/common.hpp | 30 ++ src/caffe/OCL_kernel.cl | 2 +- src/caffe/layers/conv_layer.cpp | 9 +- src/caffe/layers/lrn_layer.cpp | 2 +- src/caffe/net.cpp | 864 ++++++++++++++++++++++++++++++++ src/caffe/solver.cpp | 23 - 6 files changed, 903 insertions(+), 27 deletions(-) create mode 100644 src/caffe/net.cpp diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index e0703056..debc73a3 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -105,6 +105,36 @@ do{ \ printf("\n\n"); \ }while(0) +#define CHECK_GLOBAL_MEM_DATA(global_mem, count, num, marker)\ +do{ \ + Dtype *global_mem_cpu = new Dtype[count]; \ + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)global_mem, \ + CL_TRUE, 0, sizeof(Dtype)*count, global_mem_cpu,0, NULL, NULL); \ + size_t sample_interval = count/num; \ + if(sample_interval == 0){ \ + sample_interval=1; \ + } \ + printf("%s: ", marker); \ + for(int i=0; i __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ int gdx = get_global_id(0); if(gdx < N){ - Y[gdx] =((0.0::Backward_cpu(const vector*>& top, } } } + CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); + CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); + CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff"); - CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } template @@ -137,7 +139,10 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, } } - CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); + CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); + CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); + CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); + // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index e49e2963..2dc18595 100644 --- 
a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -256,7 +256,7 @@ void LRNLayer::CrossChannelForward_gpu(const vector*>& bottom template void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ - CrossChannelBackward_gpu(top, propagate_down, bottom); + CrossChannelBackward_cpu(top, propagate_down, bottom); } template diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp new file mode 100644 index 00000000..4de7a146 --- /dev/null +++ b/src/caffe/net.cpp @@ -0,0 +1,864 @@ +#include +#include +#include +#include +#include +#include + +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/insert_splits.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/upgrade_proto.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +Net::Net(const NetParameter& param) { + Init(param); +} + +template +Net::Net(const string& param_file, Phase phase) { + NetParameter param; + ReadNetParamsFromTextFileOrDie(param_file, ¶m); + param.mutable_state()->set_phase(phase); + Init(param); +} + +template +void Net::Init(const NetParameter& in_param) { + // Set phase from the state. + phase_ = in_param.state().phase(); + // Filter layers based on their include/exclude rules and + // the current NetState. + NetParameter filtered_param; + FilterNet(in_param, &filtered_param); + LOG(INFO) << "Initializing net from parameters: " << std::endl + << filtered_param.DebugString(); + // Create a copy of filtered_param with splits added where necessary. + NetParameter param; + InsertSplits(filtered_param, ¶m); + // Basically, build all the layers and set up their connections. 
+ name_ = param.name(); + map blob_name_to_idx; + set available_blobs; + CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) + << "Must specify either input_shape OR deprecated input_dim, not both."; + if (param.input_dim_size() > 0) { + // Deprecated 4D dimensions. + CHECK_EQ(param.input_size() * 4, param.input_dim_size()) + << "Incorrect input blob dimension specifications."; + } else { + CHECK_EQ(param.input_size(), param.input_shape_size()) + << "Exactly one input_shape must be specified per input."; + } + memory_used_ = 0; + // set the input blobs + for (int input_id = 0; input_id < param.input_size(); ++input_id) { + const int layer_id = -1; // inputs have fake layer ID -1 + AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + // For each layer, set up its input and output + bottom_vecs_.resize(param.layer_size()); + top_vecs_.resize(param.layer_size()); + bottom_id_vecs_.resize(param.layer_size()); + param_id_vecs_.resize(param.layer_size()); + top_id_vecs_.resize(param.layer_size()); + bottom_need_backward_.resize(param.layer_size()); + for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { + // Inherit phase from net if unset. + if (!param.layer(layer_id).has_phase()) { + param.mutable_layer(layer_id)->set_phase(phase_); + } + // Setup layer. 
+ const LayerParameter& layer_param = param.layer(layer_id); + if (layer_param.propagate_down_size() > 0) { + CHECK_EQ(layer_param.propagate_down_size(), + layer_param.bottom_size()) + << "propagate_down param must be specified " + << "either 0 or bottom_size times "; + } + layers_.push_back(LayerRegistry::CreateLayer(layer_param)); + layer_names_.push_back(layer_param.name()); + LOG(INFO) << "Creating Layer " << layer_param.name(); + bool need_backward = false; + + // Figure out this layer's input and output + for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); + ++bottom_id) { + const int blob_id = AppendBottom(param, layer_id, bottom_id, + &available_blobs, &blob_name_to_idx); + // If a blob needs backward, this layer should provide it. + need_backward |= blob_need_backward_[blob_id]; + } + int num_top = layer_param.top_size(); + for (int top_id = 0; top_id < num_top; ++top_id) { + AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); + } + // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter + // specified fewer than the required number (as specified by + // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. + Layer* layer = layers_[layer_id].get(); + if (layer->AutoTopBlobs()) { + const int needed_num_top = + std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); + for (; num_top < needed_num_top; ++num_top) { + // Add "anonymous" top blobs -- do not modify available_blobs or + // blob_name_to_idx as we don't want these blobs to be usable as input + // to other layers. + AppendTop(param, layer_id, num_top, NULL, NULL); + } + } + // After this layer is connected, set it up. 
+ LOG(INFO) << "Setting up " << layer_names_[layer_id]; + layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { + blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); + } + blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); + LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); + if (layer->loss(top_id)) { + LOG(INFO) << " with loss weight " << layer->loss(top_id); + } + memory_used_ += top_vecs_[layer_id][top_id]->count(); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + const int param_size = layer_param.param_size(); + const int num_param_blobs = layers_[layer_id]->blobs().size(); + CHECK_LE(param_size, num_param_blobs) + << "Too many params specified for layer " << layer_param.name(); + ParamSpec default_param_spec; + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + const ParamSpec* param_spec = (param_id < param_size) ? + &layer_param.param(param_id) : &default_param_spec; + const bool param_need_backward = param_spec->lr_mult() > 0; + need_backward |= param_need_backward; + layers_[layer_id]->set_param_propagate_down(param_id, + param_need_backward); + } + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + AppendParam(param, layer_id, param_id); + } + // Finally, set the backward flag + layer_need_backward_.push_back(need_backward); + if (need_backward) { + for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { + blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; + } + } + } + // Go through the net backwards to determine which blobs contribute to the + // loss. We can skip backward computation for blobs that don't contribute + // to the loss. 
+ // Also checks if all bottom blobs don't need backward computation (possible + // because the skip_propagate_down param) and so we can skip bacward + // computation for the entire layer + set blobs_under_loss; + set blobs_skip_backp; + for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { + bool layer_contributes_loss = false; + bool layer_skip_propagate_down = true; + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + if (layers_[layer_id]->loss(top_id) || + (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + layer_contributes_loss = true; + } + if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { + layer_skip_propagate_down = false; + } + if (layer_contributes_loss && !layer_skip_propagate_down) + break; + } + // If this layer can skip backward computation, also all his bottom blobs + // don't need backpropagation + if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { + layer_need_backward_[layer_id] = false; + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + ++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = false; + } + } + if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } + if (layer_need_backward_[layer_id]) { + LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; + } else { + LOG(INFO) << layer_names_[layer_id] + << " does not need backward computation."; + } + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + ++bottom_id) { + if (layer_contributes_loss) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blobs_under_loss.insert(blob_name); + } else { + bottom_need_backward_[layer_id][bottom_id] = false; + } + if (!bottom_need_backward_[layer_id][bottom_id]) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blobs_skip_backp.insert(blob_name); + } + } + 
} + // Handle force_backward if needed. + if (param.force_backward()) { + for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { + layer_need_backward_[layer_id] = true; + for (int bottom_id = 0; + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = + bottom_need_backward_[layer_id][bottom_id] || + layers_[layer_id]->AllowForceBackward(bottom_id); + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || + bottom_need_backward_[layer_id][bottom_id]; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + layers_[layer_id]->set_param_propagate_down(param_id, true); + } + } + } + // In the end, all remaining blobs are considered output blobs. + for (set::iterator it = available_blobs.begin(); + it != available_blobs.end(); ++it) { + LOG(INFO) << "This network produces output " << *it; + net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); + net_output_blob_indices_.push_back(blob_name_to_idx[*it]); + } + for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { + blob_names_index_[blob_names_[blob_id]] = blob_id; + } + for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { + layer_names_index_[layer_names_[layer_id]] = layer_id; + } + GetLearningRateAndWeightDecay(); + debug_info_ = param.debug_info(); + LOG(INFO) << "Network initialization done."; + LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); +} + +template +void Net::FilterNet(const NetParameter& param, + NetParameter* param_filtered) { + NetState net_state(param.state()); + param_filtered->CopyFrom(param); + param_filtered->clear_layer(); + for (int i = 0; i < param.layer_size(); ++i) { + const LayerParameter& layer_param = param.layer(i); + const string& layer_name = layer_param.name(); + CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) + << 
"Specify either include rules or exclude rules; not both."; + // If no include rules are specified, the layer is included by default and + // only excluded if it meets one of the exclude rules. + bool layer_included = (layer_param.include_size() == 0); + for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { + layer_included = false; + } + } + for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { + layer_included = true; + } + } + if (layer_included) { + param_filtered->add_layer()->CopyFrom(layer_param); + } + } +} + +template +bool Net::StateMeetsRule(const NetState& state, + const NetStateRule& rule, const string& layer_name) { + // Check whether the rule is broken due to phase. + if (rule.has_phase()) { + if (rule.phase() != state.phase()) { + LOG(INFO) << "The NetState phase (" << state.phase() + << ") differed from the phase (" << rule.phase() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to min level. + if (rule.has_min_level()) { + if (state.level() < rule.min_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the min_level (" << rule.min_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to max level. + if (rule.has_max_level()) { + if (state.level() > rule.max_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the max_level (" << rule.max_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to stage. The NetState must + // contain ALL of the rule's stages to meet it. + for (int i = 0; i < rule.stage_size(); ++i) { + // Check that the NetState contains the rule's ith stage. 
+ bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.stage(i) == state.stage(j)) { has_stage = true; } + } + if (!has_stage) { + LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to not_stage. The NetState must + // contain NONE of the rule's not_stages to meet it. + for (int i = 0; i < rule.not_stage_size(); ++i) { + // Check that the NetState contains the rule's ith not_stage. + bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } + } + if (has_stage) { + LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + return true; +} + +// Helper for Net::Init: add a new input or top blob to the net. (Inputs have +// layer_id == -1, tops have layer_id >= 0.) +template +void Net::AppendTop(const NetParameter& param, const int layer_id, + const int top_id, set* available_blobs, + map* blob_name_to_idx) { + shared_ptr layer_param((layer_id >= 0) ? + (new LayerParameter(param.layer(layer_id))) : NULL); + const string& blob_name = layer_param ? + (layer_param->top_size() > top_id ? 
+ layer_param->top(top_id) : "(automatic)") : param.input(top_id); + // Check if we are doing in-place computation + if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && + blob_name == layer_param->bottom(top_id)) { + // In-place computation + LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; + top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); + top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); + } else if (blob_name_to_idx && + blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + // If we are not doing in-place computation but have duplicated blobs, + // raise an error. + LOG(FATAL) << "Duplicate blobs produced by multiple sources."; + } else { + // Normal output. + if (layer_param) { + LOG(INFO) << layer_param->name() << " -> " << blob_name; + } else { + LOG(INFO) << "Input " << top_id << " -> " << blob_name; + } + shared_ptr > blob_pointer(new Blob()); + const int blob_id = blobs_.size(); + blobs_.push_back(blob_pointer); + blob_names_.push_back(blob_name); + blob_need_backward_.push_back(false); + if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } + if (layer_id == -1) { + // Set the (explicitly specified) dimensions of the input blob. + if (param.input_dim_size() > 0) { + blob_pointer->Reshape(param.input_dim(top_id * 4), + param.input_dim(top_id * 4 + 1), + param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); + } else { + blob_pointer->Reshape(param.input_shape(top_id)); + } + net_input_blob_indices_.push_back(blob_id); + net_input_blobs_.push_back(blob_pointer.get()); + } else { + top_id_vecs_[layer_id].push_back(blob_id); + top_vecs_[layer_id].push_back(blob_pointer.get()); + } + } + if (available_blobs) { available_blobs->insert(blob_name); } +} + +// Helper for Net::Init: add a new bottom blob to the net. 
+template +int Net::AppendBottom(const NetParameter& param, const int layer_id, + const int bottom_id, set* available_blobs, + map* blob_name_to_idx) { + const LayerParameter& layer_param = param.layer(layer_id); + const string& blob_name = layer_param.bottom(bottom_id); + if (available_blobs->find(blob_name) == available_blobs->end()) { + LOG(FATAL) << "Unknown blob input " << blob_name + << " (at index " << bottom_id << ") to layer " << layer_id; + } + const int blob_id = (*blob_name_to_idx)[blob_name]; + LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; + bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); + bottom_id_vecs_[layer_id].push_back(blob_id); + available_blobs->erase(blob_name); + bool propagate_down = true; + // Check if the backpropagation on bottom_id should be skipped + if (layer_param.propagate_down_size() > 0) + propagate_down = layer_param.propagate_down(bottom_id); + const bool need_backward = blob_need_backward_[blob_id] && + propagate_down; + bottom_need_backward_[layer_id].push_back(need_backward); + return blob_id; +} + +template +void Net::AppendParam(const NetParameter& param, const int layer_id, + const int param_id) { + const LayerParameter& layer_param = layers_[layer_id]->layer_param(); + const int param_size = layer_param.param_size(); + string param_name = + (param_size > param_id) ? 
layer_param.param(param_id).name() : ""; + if (param_name.size()) { + param_display_names_.push_back(param_name); + } else { + ostringstream param_display_name; + param_display_name << param_id; + param_display_names_.push_back(param_display_name.str()); + } + const int net_param_id = params_.size(); + params_.push_back(layers_[layer_id]->blobs()[param_id]); + param_id_vecs_[layer_id].push_back(net_param_id); + param_layer_indices_.push_back(make_pair(layer_id, param_id)); + if (!param_size || !param_name.size() || (param_name.size() && + param_names_index_.find(param_name) == param_names_index_.end())) { + // This layer "owns" this parameter blob -- it is either anonymous + // (i.e., not given a param_name) or explicitly given a name that we + // haven't already seen. + param_owners_.push_back(-1); + if (param_name.size()) { + param_names_index_[param_name] = net_param_id; + } + } else { + // Named param blob with name we've seen before: share params + const int owner_net_param_id = param_names_index_[param_name]; + param_owners_.push_back(owner_net_param_id); + const pair& owner_index = + param_layer_indices_[owner_net_param_id]; + const int owner_layer_id = owner_index.first; + const int owner_param_id = owner_index.second; + LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " + << "layer '" << layer_names_[owner_layer_id] << "', param " + << "index " << owner_param_id; + Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); + Blob* owner_blob = + layers_[owner_layer_id]->blobs()[owner_param_id].get(); + const int param_size = layer_param.param_size(); + if (param_size > param_id && (layer_param.param(param_id).share_mode() == + ParamSpec_DimCheckMode_PERMISSIVE)) { + // Permissive dimension checking -- only check counts are the same. + CHECK_EQ(this_blob->count(), owner_blob->count()) + << "Shared parameter blobs must have the same count."; + } else { + // Strict dimension checking -- all dims must be the same. 
+ CHECK(this_blob->shape() == owner_blob->shape()); + } + layers_[layer_id]->blobs()[param_id]->ShareData( + *layers_[owner_layer_id]->blobs()[owner_param_id]); + } +} + +template +void Net::GetLearningRateAndWeightDecay() { + LOG(INFO) << "Collecting Learning Rate and Weight Decay."; + ParamSpec default_param_spec; + for (int i = 0; i < layers_.size(); ++i) { + vector > >& layer_blobs = layers_[i]->blobs(); + for (int j = 0; j < layer_blobs.size(); ++j) { + const ParamSpec* param_spec = + (layers_[i]->layer_param().param_size() > j) ? + &layers_[i]->layer_param().param(j) : &default_param_spec; + params_lr_.push_back(param_spec->lr_mult()); + params_weight_decay_.push_back(param_spec->decay_mult()); + } + } +} + +template +Dtype Net::ForwardFromTo(int start, int end) { + CHECK_GE(start, 0); + CHECK_LT(end, layers_.size()); + Dtype loss = 0; + if (debug_info_) { + for (int i = 0; i < net_input_blobs_.size(); ++i) { + InputDebugInfo(i); + } + } + for (int i = start; i <= end; ++i) { + // LOG(ERROR) << "Forwarding " << layer_names_[i]; +//Yibing add for porting + printf("Forwarding %s\n",layer_names_[i].c_str()); + Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); + loss += layer_loss; + if (debug_info_) { ForwardDebugInfo(i); } +//Yibing add for porting + clFinish(amdDevice.CommandQueue); + } + return loss; +} + +template +Dtype Net::ForwardFrom(int start) { + return ForwardFromTo(start, layers_.size() - 1); +} + +template +Dtype Net::ForwardTo(int end) { + return ForwardFromTo(0, end); +} + +template +const vector*>& Net::ForwardPrefilled(Dtype* loss) { + if (loss != NULL) { + *loss = ForwardFromTo(0, layers_.size() - 1); + } else { + ForwardFromTo(0, layers_.size() - 1); + } + return net_output_blobs_; +} + +template +const vector*>& Net::Forward( + const vector*> & bottom, Dtype* loss) { + // Copy bottom to internal bottom + for (int i = 0; i < bottom.size(); ++i) { + net_input_blobs_[i]->CopyFrom(*bottom[i]); + } + return 
ForwardPrefilled(loss); +} + +template +string Net::Forward(const string& input_blob_protos, Dtype* loss) { + BlobProtoVector blob_proto_vec; + if (net_input_blobs_.size()) { + blob_proto_vec.ParseFromString(input_blob_protos); + CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) + << "Incorrect input size."; + for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { + net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); + } + } + ForwardPrefilled(loss); + blob_proto_vec.Clear(); + for (int i = 0; i < net_output_blobs_.size(); ++i) { + net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); + } + string output; + blob_proto_vec.SerializeToString(&output); + return output; +} + +template +void Net::BackwardFromTo(int start, int end) { + CHECK_GE(end, 0); + CHECK_LT(start, layers_.size()); + for (int i = start; i >= end; --i) { + if (layer_need_backward_[i]) { +//Yibing add for porting + printf("Backwarding %s\n",layer_names_[i].c_str()); + layers_[i]->Backward( + top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); + if (debug_info_) { BackwardDebugInfo(i); } +//Yibing add for porting + clFinish(amdDevice.CommandQueue); + } + } +} + +template +void Net::InputDebugInfo(const int input_id) { + const Blob& blob = *net_input_blobs_[input_id]; + const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " + << "Input " << blob_name << " data: " << data_abs_val_mean; +} + +template +void Net::ForwardDebugInfo(const int layer_id) { + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const Blob& blob = *top_vecs_[layer_id][top_id]; + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << data_abs_val_mean; + } + for (int 
param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const int net_param_id = param_id_vecs_[layer_id][param_id]; + const string& blob_name = param_display_names_[net_param_id]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name + << " data: " << data_abs_val_mean; + } +} + +template +void Net::BackwardDebugInfo(const int layer_id) { + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << diff_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << diff_abs_val_mean; + } +} + +template +void Net::UpdateDebugInfo(const int param_id) { + const Blob& blob = *params_[param_id]; + const int param_owner = param_owners_[param_id]; + const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; + const string& param_display_name = param_display_names_[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + if (param_owner < 0) { + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Update] Layer " << 
layer_name + << ", param " << param_display_name + << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; + } else { + const string& owner_layer_name = + layer_names_[param_layer_indices_[param_owner].first]; + LOG(INFO) << " [Update] Layer " << layer_name + << ", param blob " << param_display_name + << " (owned by layer " << owner_layer_name << ", " + << "param " << param_display_names_[param_owners_[param_id]] << ")" + << " diff: " << diff_abs_val_mean; + } +} + +template +void Net::ShareTrainedLayersWith(const Net* other) { + int num_source_layers = other->layers().size(); + for (int i = 0; i < num_source_layers; ++i) { + Layer* source_layer = other->layers()[i].get(); + const string& source_layer_name = other->layer_names()[i]; + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() && + layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector > >& target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + Blob* source_blob = source_layer->blobs()[j].get(); + CHECK(target_blobs[j]->shape() == source_blob->shape()); + target_blobs[j]->ShareData(*source_blob); + } + } +} + +template +void Net::BackwardFrom(int start) { + BackwardFromTo(start, 0); +} + +template +void Net::BackwardTo(int end) { + BackwardFromTo(layers_.size() - 1, end); +} + +template +void Net::Backward() { + BackwardFromTo(layers_.size() - 1, 0); + if (debug_info_) { + Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { continue; } + asum_data += params_[i]->asum_data(); + asum_diff 
+= params_[i]->asum_diff(); + sumsq_data += params_[i]->sumsq_data(); + sumsq_diff += params_[i]->sumsq_diff(); + } + const Dtype l2norm_data = std::sqrt(sumsq_data); + const Dtype l2norm_diff = std::sqrt(sumsq_diff); + LOG(ERROR) << " [Backward] All net params (data, diff): " + << "L1 norm = (" << asum_data << ", " << asum_diff << "); " + << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; + } +} + +template +void Net::Reshape() { + for (int i = 0; i < layers_.size(); ++i) { + layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); + } +} + +template +void Net::CopyTrainedLayersFrom(const NetParameter& param) { + int num_source_layers = param.layer_size(); + for (int i = 0; i < num_source_layers; ++i) { + const LayerParameter& source_layer = param.layer(i); + const string& source_layer_name = source_layer.name(); + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() && + layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector > >& target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + const bool kReshape = false; + target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); + } + } +} + +template +void Net::CopyTrainedLayersFrom(const string trained_filename) { + NetParameter param; + ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); + CopyTrainedLayersFrom(param); +} + +template +void Net::ToProto(NetParameter* param, bool write_diff) const { + param->Clear(); + param->set_name(name_); + // Add bottom and top + for (int i = 0; i < net_input_blob_indices_.size(); ++i) { + param->add_input(blob_names_[net_input_blob_indices_[i]]); + } + 
DLOG(INFO) << "Serializing " << layers_.size() << " layers"; + for (int i = 0; i < layers_.size(); ++i) { + LayerParameter* layer_param = param->add_layer(); + for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { + layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); + } + for (int j = 0; j < top_id_vecs_[i].size(); ++j) { + layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); + } + layers_[i]->ToProto(layer_param, write_diff); + } +} + +template +void Net::Update() { + // First, accumulate the diffs of any shared parameters into their owner's + // diff. (Assumes that the learning rate, weight decay, etc. have already been + // accounted for in the current diff.) + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] < 0) { continue; } + if (debug_info_) { UpdateDebugInfo(i); } + const int count = params_[i]->count(); + const Dtype* this_diff; + Dtype* owner_diff; + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + + switch (Caffe::mode()) { + case Caffe::CPU: + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + caffe_add(count, this_diff, owner_diff, owner_diff); + break; + case Caffe::GPU: +#ifndef CPU_ONLY + this_diff = params_[i]->gpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); + // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_axpy(count, 1.0, this_diff, owner_diff); +#else + NO_GPU; +#endif + break; + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } + } + // Now, update the owned parameters. 
+ for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { continue; } + if (debug_info_) { UpdateDebugInfo(i); } + params_[i]->Update(); + } +} + +template + bool Net::has_blob(const string& blob_name) const { + return blob_names_index_.find(blob_name) != blob_names_index_.end(); +} + +template +const shared_ptr > Net::blob_by_name( + const string& blob_name) const { + shared_ptr > blob_ptr; + if (has_blob(blob_name)) { + blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; + } else { + blob_ptr.reset((Blob*)(NULL)); + LOG(WARNING) << "Unknown blob name " << blob_name; + } + return blob_ptr; +} + +template +bool Net::has_layer(const string& layer_name) const { + return layer_names_index_.find(layer_name) != layer_names_index_.end(); +} + +template +const shared_ptr > Net::layer_by_name( + const string& layer_name) const { + shared_ptr > layer_ptr; + if (has_layer(layer_name)) { + layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; + } else { + layer_ptr.reset((Layer*)(NULL)); + LOG(WARNING) << "Unknown layer name " << layer_name; + } + return layer_ptr; +} + +INSTANTIATE_CLASS(Net); + +} // namespace caffe diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 87f746d8..33bb5ed5 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -520,7 +520,6 @@ void SGDSolver::Normalize(int param_id) { #ifndef CPU_ONLY caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_gpu_diff()); - CHECK_BLOB_DATA(net_params[param_id], 10, "NORM"); #else NO_GPU; #endif @@ -540,14 +539,6 @@ void SGDSolver::Regularize(int param_id) { string regularization_type = this->param_.regularization_type(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - Dtype *cpu_diff = net_params[param_id]->mutable_cpu_diff(); - printf("cpu diff before reg\n"); - for(int i=0; i<10; i++) - printf("%f,",cpu_diff[i]); - printf("\n"); - - - switch (Caffe::mode()) { case Caffe::CPU: { if 
(local_decay) { @@ -600,18 +591,6 @@ void SGDSolver::Regularize(int param_id) { default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } - CHECK_BLOB_DATA(net_params[param_id], 10, "REGU"); - cpu_diff = net_params[param_id]->mutable_cpu_diff(); - printf("cpu diff\n"); - for(int i=0; i<10; i++) - printf("%f,",cpu_diff[i]); - printf("\n"); - - cpu_diff = temp_[param_id]->mutable_cpu_diff(); - printf("tmp\n"); - for(int i=0; i<10; i++) - printf("%f,",cpu_diff[i]); - printf("\n"); } template @@ -639,8 +618,6 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { caffe_gpu_copy(net_params[param_id]->count(), history_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); - -CHECK_BLOB_DATA(net_params[param_id], 10, "COMPUTATE"); #else NO_GPU; #endif From 10f731bf09d3bf2aac96279c5322dd164dfb2d5c Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 23 Jul 2015 14:48:37 +0800 Subject: [PATCH 009/124] OpenCL porting for relu, sofmax layer --- include/caffe/loss_layers.hpp | 11 +++- include/caffe/neuron_layers.hpp | 21 +++++- include/caffe/util/ocl_wrapper.hpp | 18 ++++- src/caffe/OCL_kernel.cl | 86 +++++++++++++++++++++--- src/caffe/layers/dropout_layer.cpp | 66 +++++++++++++++++-- src/caffe/layers/relu_layer.cpp | 49 ++++++++++++-- src/caffe/layers/softmax_loss_layer.cpp | 88 +++++++++++++++++++++++-- src/caffe/util/math_functions.cpp | 6 ++ src/caffe/util/ocl_wrapper.cpp | 70 ++++++++++++++++++-- 9 files changed, 379 insertions(+), 36 deletions(-) diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 86c34241..5aa02be1 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -695,6 +695,7 @@ class SoftmaxWithLossLayer : public LossLayer { */ explicit SoftmaxWithLossLayer(const LayerParameter& param) : LossLayer(param) {} + ~SoftmaxWithLossLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, @@ -742,8 +743,8 @@ class 
SoftmaxWithLossLayer : public LossLayer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - - + void ocl_setup(); + /// The internal SoftmaxLayer used to map predictions to a distribution. shared_ptr > softmax_layer_; /// prob stores the output probability predictions from the SoftmaxLayer. @@ -761,6 +762,12 @@ class SoftmaxWithLossLayer : public LossLayer { bool normalize_; int softmax_axis_, outer_num_, inner_num_; + + protected: + cl_kernel diff_kernel, scal_kernel, softmax_kernel; + cl_mem d_loss; + cl_kernel softmax_loss_fp_kernel; + cl_kernel softmax_loss_bp_kernel; }; } // namespace caffe diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index c2e0774a..65a7e9f2 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -167,6 +167,13 @@ class DropoutLayer : public NeuronLayer { const vector*>& top); virtual inline const char* type() const { return "Dropout"; } + virtual ~DropoutLayer(); + void ocl_setup(int bottom_count); + cl_mem MaskMem; + cl_kernel ocl_Kernel_Fwd; + cl_kernel ocl_Kernel_Bwd; + cl_kernel rng_kernel; + protected: /** @@ -420,8 +427,10 @@ class ReLULayer : public NeuronLayer { * the value @f$ \nu @f$ by which negative values are multiplied. 
*/ explicit ReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - + : NeuronLayer(param) { + ocl_setup(); + } + ~ReLULayer(); virtual inline const char* type() const { return "ReLU"; } protected: @@ -473,6 +482,14 @@ class ReLULayer : public NeuronLayer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + +//OpenCL related setiup + void ocl_setup(); + + protected: + cl_kernel ReLUForward_kernel; + cl_kernel ReLUBackward_kernel; + }; #ifdef USE_CUDNN diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index df9e855e..519f15d4 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -43,10 +43,10 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); template -void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data); +void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); template -void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff); +void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); template void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); @@ -62,6 +62,20 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype template void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ); + +template +void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, + const Dtype* prob_data, const 
Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts); + +template +void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top, + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts); + } // namespace caffe #endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 8d497ced..8a5d1138 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -1098,25 +1098,25 @@ template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePo template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global double* bottom_diff); template -__kernel void ReLUForward(const int count, __global T* in, __global T* out){ +__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ int index = get_global_id(0); if(index < count) - out[index] = in[index] > 0? in[index]:0; + out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; } -//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out); -template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out); -template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out); +//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); template -__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff){ +__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ int index = get_global_id(0); if(index < count) - out_diff[index] = in_diff[index] * (in_data[index] > 0); + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); /* the slope scales the incoming gradient where the input is not positive; it is not an additive term */ } -template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff); -template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff); +template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float
negative_slope); +template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); template __kernel void get_max(const int num, const int dim, __global T* data, __global T* out){ @@ -1193,6 +1193,74 @@ __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); +template +__kernel void SoftmaxLossForwardGPU(const int nthreads, + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global T* counts) { + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + T(FLT_MIN))); + counts[index] = 1; + } + } +} + +template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); +template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global double* prob_data, __global double* label,__global double* loss, + int num, int 
dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); + +template +__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, + __global T* label,__global T* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, __global T* counts) { /* counts is a device buffer: kernel pointer arguments must carry an address space qualifier, as in the forward kernel */ + const int channels = dim / spatial_dim; + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } +} + + +template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, __global float* counts); + +template __attribute__ ((mangled_name(softmax_loss_bp_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, __global double* counts); + template __kernel void diff (const int num, const int dim, __global T* data, __global T* label){ diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 4239443d..7799950e 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -10,6 +10,27 @@ namespace caffe { +template +void DropoutLayer::ocl_setup(int bottom_count){ + //create OpenCL related cl_mem objects and kernels + //if(Caffe::mode() == Caffe::GPU){ + cl_int _err; + ocl_Kernel_Fwd =
clCreateKernel(amdDevice.Program,"DropoutForwardfloat",&_err); + ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat",&_err); + rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat",&_err); + OCL_CHECK(_err); + MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL); +} + +template +DropoutLayer::~DropoutLayer(){ + OCL_CHECK( clReleaseMemObject(MaskMem) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_Fwd) ); + OCL_CHECK( clReleaseKernel(ocl_Kernel_Bwd) ); + OCL_CHECK( clReleaseKernel(rng_kernel) ); +} + + template void DropoutLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { @@ -19,6 +40,7 @@ void DropoutLayer::LayerSetUp(const vector*>& bottom, DCHECK(threshold_ < 1.); scale_ = 1. / (1. - threshold_); uint_thres_ = static_cast(UINT_MAX * threshold_); + ocl_setup(bottom[0]->count()); } template @@ -69,14 +91,50 @@ void DropoutLayer::Backward_cpu(const vector*>& top, template void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + unsigned int* mask = + static_cast(rand_vec_.mutable_gpu_data()); +// caffe_gpu_rng_uniform(count, mask); + + caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); + Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + + // set thresholds + // NOLINT_NEXT_LINE(whitespace/operators) +// DropoutForward<<>>( + // count, bottom_data, mask, uint_thres_, scale_, top_data); + // CUDA_POST_KERNEL_CHECK; + } else { + caffe_gpu_copy(count, bottom_data, top_data); + } } + template void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + const 
vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = + static_cast(rand_vec_.gpu_data()); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + // DropoutBackward<<>>( + // count, top_diff, mask, uint_thres_, scale_, bottom_diff); + // CUDA_POST_KERNEL_CHECK; + Dropout_bp_gpu(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); + } else { + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + } } diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index ce85b1cc..d7b0a838 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,6 +5,20 @@ #include "caffe/vision_layers.hpp" namespace caffe { +template +void ReLULayer::ocl_setup(){ + cl_int _err=0; + ReLUForward_kernel = clCreateKernel(amdDevice.Program,"ReLUForwardfloat",&_err); + ReLUBackward_kernel = clCreateKernel(amdDevice.Program,"ReLUBackwardfloat",&_err); +} + +template +ReLULayer::~ReLULayer(){ + OCL_CHECK( clReleaseKernel(ReLUForward_kernel) ); + OCL_CHECK( clReleaseKernel(ReLUBackward_kernel) ); +} + + template void ReLULayer::Forward_cpu(const vector*>& bottom, @@ -36,16 +50,43 @@ void ReLULayer::Backward_cpu(const vector*>& top, } } + template void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + // NOLINT_NEXT_LINE(whitespace/operators) + // ReLUForward<<>>( + // count, bottom_data, top_data, negative_slope); + //CUDA_POST_KERNEL_CHECK; + // << " count: " << count << " bottom_data: " + // << 
(unsigned long)bottom_data + // << " top_data: " << (unsigned long)top_data + // << " blocks: " << CAFFE_GET_BLOCKS(count) + // << " threads: " << CAFFE_CUDA_NUM_THREADS; + Relu_fp_gpu(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); } + template void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + // NOLINT_NEXT_LINE(whitespace/operators) +// ReLUBackward<<>>( + // count, top_diff, bottom_data, bottom_diff, negative_slope); + // CUDA_POST_KERNEL_CHECK; + Relu_bp_gpu(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); + } } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 072f9f71..4b091d3a 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -28,6 +28,28 @@ void SoftmaxWithLossLayer::LayerSetUp( ignore_label_ = this->layer_param_.loss_param().ignore_label(); } normalize_ = this->layer_param_.loss_param().normalize(); + + ocl_setup(); +} + +template +void SoftmaxWithLossLayer::ocl_setup(){ + cl_int err=0; + scal_kernel = clCreateKernel(amdDevice.Program, "scal_float", &err); + diff_kernel = clCreateKernel(amdDevice.Program, "diff_float", &err); + softmax_kernel = clCreateKernel(amdDevice.Program, "softmax_float", &err); + d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL); + + softmax_loss_fp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_fp_float", &err); + softmax_loss_bp_kernel = clCreateKernel(amdDevice.Program, 
"softmax_loss_bp_float", &err); +} + +template +SoftmaxWithLossLayer::~SoftmaxWithLossLayer(){ + clReleaseKernel(diff_kernel); + clReleaseKernel(scal_kernel); + clReleaseKernel(softmax_loss_fp_kernel); + clReleaseKernel(softmax_loss_bp_kernel); } template @@ -121,19 +143,71 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } template -void SoftmaxWithLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); +void SoftmaxWithLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU(softmax_loss_fp_kernel, nthreads, prob_data, label, loss_data, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + loss /= count; + } else { + loss /= outer_num_; + } + top[0]->mutable_cpu_data()[0] = loss; + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } template void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + // caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU(softmax_loss_bp_kernel, nthreads, top_data, label, bottom_diff, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); + } else { + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } } - - #ifdef CPU_ONLY STUB_GPU(SoftmaxWithLossLayer); #endif diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index cf9b1ca5..11ccbcc2 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -536,6 +536,12 @@ double caffe_cpu_asum(const int n, const double* x) { template <> void caffe_gpu_asum(const int n, const float* x, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_float)), NULL, NULL); + clblasSasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 32a477fc..1fd48aa7 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -105,6 +105,62 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p template float softmax_gpu(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss); template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); + +template +void 
SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) +{ + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&loss)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&spatial_dim)); + const cl_bool ignore_flag = has_ignore_label_; OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*)&ignore_flag)); /* copy into a real cl_bool so the sizeof(cl_bool) bytes read here are all defined; a C++ bool is narrower */ + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); + + size_t Global_Work_Size[1] = {(size_t)((nthreads + 255) / 256 * 256)}; /* round up to a whole number of 256 wide work groups; the kernel skips indices at or beyond nthreads */ + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, const float* prob_data, const float* label, float* loss, + const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,float* counts); +template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, const double* prob_data, const double* label, double* loss, + const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts); + +template +void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top, + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) +{ +
OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&spatial_dim)); + const cl_bool ignore_flag = has_ignore_label_; OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*)&ignore_flag)); /* copy into a real cl_bool so the sizeof(cl_bool) bytes read here are all defined; a C++ bool is narrower */ + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); + + size_t Global_Work_Size[1] = {(size_t)((nthreads + 255) / 256 * 256)}; /* round up to a whole number of 256 wide work groups; the kernel skips indices at or beyond nthreads */ + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const float* top, const float* label, float* bottom_diff, + const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts); +template void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const double* top, const double* label, double* bottom_diff, + const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, double* counts); + template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){ OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); @@ -237,35 +293,37 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const fl template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_,
const int pad_, double* bottom_diff); template -void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data){ +void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); size_t Global_Work_Size[] = {count * 1}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Relu_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data); -template void Relu_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data); +template void Relu_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope); +template void Relu_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope); template -void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){ +void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); size_t Global_Work_Size[] = {count * 1}; size_t Local_Work_Size[] = {256}; 
OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Relu_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff); -template void Relu_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff); +template void Relu_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); +template void Relu_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); template void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ From fc4fa9bdc5a8fa9210a4a1261c750a2e44f629e3 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 16 Jul 2015 16:37:40 +0800 Subject: [PATCH 010/124] OpenCL porting of pooling layer --- include/caffe/util/ocl_wrapper.hpp | 21 ++ include/caffe/vision_layers.hpp | 13 ++ src/caffe/OCL_kernel.cl | 312 +++++++++++++++++++++-------- src/caffe/layers/pooling_layer.cpp | 134 ++++++++++++- src/caffe/util/ocl_wrapper.cpp | 193 ++++++++++++++++++ 5 files changed, 585 insertions(+), 88 deletions(-) diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 519f15d4..49afbffe 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -33,9 +33,30 @@ void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data); +template +void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int 
height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask); + +template +void MaxPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); + +template +void AvePoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); + +template + void StoPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff); + template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data); +template +void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int 
pad_w_, Dtype* top_data); + +template +void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data); + +template +void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data); + template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ); diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 21c72bba..75701710 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -405,10 +405,12 @@ class PoolingLayer : public Layer { public: explicit PoolingLayer(const LayerParameter& param) : Layer(param) {} + ~PoolingLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); + void ocl_setup(); virtual inline const char* type() const { return "Pooling"; } virtual inline int ExactNumBottomBlobs() const { return 1; } @@ -439,6 +441,17 @@ class PoolingLayer : public Layer { bool global_pooling_; Blob rand_idx_; Blob max_idx_; + +//opencl related data structures +protected: + cl_kernel MaxPoolForward_kernel, + AvePoolForward_kernel, + StoPoolForwardTrain_kernel, + StoPoolForwardTest_kernel, + MaxPoolBackward_kernel, + AvePoolBackward_kernel, + StoPoolBackward_kernel; 
+ }; #ifdef USE_CUDNN diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 8a5d1138..00278db7 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -747,7 +747,7 @@ template __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ int gdx = get_global_id(0); if(gdx < N){ - Y[gdx] =((0.0 < X[gdx])-(X[gdx] < 0.0)); + Y[gdx] =((0.0 -__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global T* top_data){ +__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index += tmp){ @@ -974,97 +974,191 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride; - int hend = min(hstart + kernel_size, height); - int wstart = pw * stride; - int wend = min(wstart + kernel_size, width); - T maxval = -99999999; - bottom_data += (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - maxval = max(maxval, bottom_data[h * width + w]); - } - } - top_data[index] = maxval; - } + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = 
max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_slice = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } + } + } + top_data[index] = maxval; + if (mask) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } } -template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* top_data); -template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global double* top_data); +template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); +template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); template -__kernel void AvePoolForward(const int 
nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* top_data){ - int index=get_global_id(0); - int tmp=get_global_size(0); - for(index;index -__kernel void MaxPoolBackward(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* top_diff, -const int num, const int channels, const int height, -const int width, const int pooled_height, const int pooled_width, -const int kernel_size, const int stride, __global T* bottom_diff){ +__kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* idx_data, __global T* top_data){ int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - int w = index % width; - int h = (index / width) % height; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1; - int phend = min(h / stride + 1, pooled_height); - int pwstart = (w < kernel_size) ? 
0 : (w - kernel_size) / stride + 1; - int pwend = min(w / stride + 1, pooled_width); - T gradient = 0; - T bottom_datum = - bottom_data[((n * channels + c) * height + h) * width + w]; - top_data += (n * channels + c) * pooled_height * pooled_width; - top_diff += (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff[ph * pooled_width + pw] * - (bottom_datum == top_data[ph * pooled_width + pw]); - } + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp){ + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + T cumsum = 0.; + bottom_slice = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + } + } + const float thres = rand_idx[index] * cumsum; + // Second pass: get value, and set index. 
+ cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + if (cumsum >= thres) { + rand_idx[index] = ((n * channels + c) * height + h) * width + w; + top_data[index] = bottom_slice[h * width + w]; + return; } - bottom_diff[index] = gradient; - + } + } } +} +template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); +template +__kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp){ + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 
0 to avoid divide-by-zero problems + T cumsum = FLT_MIN; + T cumvalues = 0.; + bottom_slice = + bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum; + } } -template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* bottom_diff); -template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global double* bottom_diff); +template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTestDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); +template +void MaxPoolBackward(const int nthreads, const Dtype* 
const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total){ + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + T gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff_slice = top_diff + offset; + if (mask) { + const int* const mask_slice = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask_slice[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff_slice[ph * pooled_width + pw]; + } + } + } + } else { + top_mask_slice = top_mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff_slice[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } +} +template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const 
int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* float bottom_diff); +template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* float bottom_diff); template -__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* bottom_diff){ +__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, T* const bottom_diff){ int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total){ @@ -1072,30 +1166,76 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in int h = (index / width) % height + pad; int c = (index / width / height) % channels; int n = index / width / height / channels; - int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1; - int phend = min(h / stride + 1, pooled_height); - int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1; - int pwend = min(w / stride + 1, pooled_width); + const int phstart = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); T gradient = 0; top_diff += (n * channels + c) * pooled_height * pooled_width; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int hstart = ph * stride - pad; - int wstart = pw * stride - pad; - int hend = min(hstart + kernel_size, height + pad); - int wend = min(wstart + kernel_size, width + pad); + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff[ph * pooled_width + pw] / pool_size; - } + gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; + } } bottom_diff[index] = gradient; - } } -template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* bottom_diff); -template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global double* bottom_diff); +template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int 
stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); + +template +void StoPoolBackward(const int nthreads, + const Dtype* const rand_idx, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, Dtype* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total){ + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + const Dtype* const rand_idx_slice = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + const Dtype* const top_diff_slice = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff_slice[ph * pooled_width + pw] * + (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; + } +} +template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel StoPoolBackward(const int nthreads, + const float* const rand_idx, const float* const top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, float* const bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel StoPoolBackward(const int nthreads, + const double* const rand_idx, const double* const top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, double* const bottom_diff); template __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 97a5c150..a53002dd 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -13,6 +13,17 @@ namespace caffe { using std::min; using std::max; +template +PoolingLayer::~PoolingLayer(){ + OCL_CHECK( clReleaseKernel(MaxPoolForward_kernel) ); + OCL_CHECK( clReleaseKernel(AvePoolForward_kernel) ); + OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) ); + OCL_CHECK( 
clReleaseKernel(StoPoolForwardTest_kernel) ); + OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) ); + OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) ); + OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) ); +} + template void PoolingLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { @@ -76,6 +87,19 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, CHECK_LT(pad_h_, kernel_h_); CHECK_LT(pad_w_, kernel_w_); } + //Intialize OpenCL related + ocl_setup(); +} + +template + void PoolingLayer::ocl_setup(){ + MaxPoolForward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolForwardfloat", NULL); + AvePoolForward_kernel = clCreateKernel(amdDevice.Program, "AvePoolForwardfloat", NULL); + StoPoolForwardTrain_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTrainfloat", NULL); + StoPoolForwardTest_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTestfloat", NULL); + MaxPoolBackward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolBackwardfloat", NULL); + AvePoolBackward_kernel = clCreateKernel(amdDevice.Program, "AvePoolBackwardfloat", NULL); + StoPoolBackward_kernel = clCreateKernel(amdDevice.Program, "StoPoolBackwardfloat", NULL); } template @@ -312,13 +336,119 @@ void PoolingLayer::Backward_cpu(const vector*>& top, template void PoolingLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ - Forward_cpu(bottom, top); + //Forward_cpu(bottom, top); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + int* mask = NULL; + Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->mutable_gpu_data(); + } else { + mask = max_idx_.mutable_gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward(MaxPoolForward_kernel, + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, + mask, top_mask); + /* + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward<<>>( + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, + mask, top_mask);*/ + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward(AvePoolForward_kernel, + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); + /* + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward<<>>( + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);*/ + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + if (this->phase_ == TRAIN) { + // We need to create the random index as well. 
+ caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTrain(StoPoolForwardTrain_kernel, + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, + rand_idx_.mutable_gpu_data(), top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTest(StoPoolForwardTest_kernel, + count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, top_data); + } + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } template void PoolingLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + //Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + const int* mask = NULL; + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->gpu_data(); + } else { + mask = max_idx_.gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolBackward(MaxPoolBackward_kernel, + count, top_diff, mask, top_mask, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, + kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, + bottom_diff); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolBackward(AvePoolBackward_kernel, + count, top_diff, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolBackward(StoPoolBackward_kernel, + count, rand_idx_.gpu_data(), top_diff, + top[0]->num(), channels_, height_, width_, pooled_height_, + pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, + bottom_diff); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } #ifdef CPU_ONLY diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 1fd48aa7..b47a0a91 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -216,6 +216,115 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data); template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, 
const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data); +template +void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*)&mask); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolForward(cl_kernel 
Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask); +template void MaxPoolForward(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask); + +template +void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), 
(void*)&idx_data); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void StoPoolForwardTrain(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data); +template void StoPoolForwardTrain(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data); + +template +void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); + ret |= 
clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} +template void StoPoolForwardTest(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data); +template void StoPoolForwardTest(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data); + +template +void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), 
(void*)&pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {count * 1}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolForward(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data); +template void AvePoolForward(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data); + template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){ cl_int ret; @@ -267,6 +376,90 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data template void max_pool_bp_gpu(cl_kernel 
Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff ); +template +void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&mask); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_mask); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_h); + ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&kernel_w); + ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_h); + ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), 
(void*)&stride_w); + ret |= clSetKernelArg(Kernel,14, sizeof(cl_int), (void*)&pad_h); + ret |= clSetKernelArg(Kernel,15, sizeof(cl_int), (void*)&pad_w); + ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolBackward(cl_kernel kernel, const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); +template void MaxPoolBackward(cl_kernel kernel, const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); + +template +void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff) +{ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&num); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), 
(void*)&height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&stride_h); + ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_w); + ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&pad_h); + ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&pad_w); + ret |= clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolBackward(cl_kernel kernel, const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); +template void AvePoolBackward(cl_kernel kernel, const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); + +template +void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int 
stride_w, Dtype* const bottom_diff){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&height); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&width); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_height); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_width); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_h); + ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_w); + ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_h); + ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_w); + ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void StoPoolBackward(cl_kernel kernel, const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff); +template void StoPoolBackward(cl_kernel kernel, const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff); + template void 
ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){ cl_int ret; From e5dc1d75e1df5f35310c6e6b226ef194899d0753 Mon Sep 17 00:00:00 2001 From: Junli Date: Sat, 18 Jul 2015 11:58:59 +0800 Subject: [PATCH 011/124] OpenCL porting of LRN layers and inner-product layer; fixed some bugs in solver --- include/caffe/common_layers.hpp | 1 + include/caffe/neuron_layers.hpp | 4 + include/caffe/solver.hpp | 15 + include/caffe/util/math_functions.hpp | 6 - include/caffe/util/ocl_wrapper.hpp | 25 +- include/caffe/vision_layers.hpp | 4 +- src/caffe/OCL_kernel.cl | 343 +++++++++--------- .../layers/cufiles/inner_product_layer.cu | 27 +- src/caffe/layers/inner_product_layer.cpp | 39 +- src/caffe/layers/lrn_layer.cpp | 65 +++- src/caffe/layers/power_layer.cpp | 77 ++++ src/caffe/layers/split_layer.cpp | 20 +- src/caffe/solver.cpp | 24 +- src/caffe/util/math_functions.cpp | 2 + src/caffe/util/ocl_wrapper.cpp | 268 ++++++-------- 15 files changed, 560 insertions(+), 360 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d2c0ce6d..4e884f21 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -567,6 +567,7 @@ class SplitLayer : public Layer { const vector& propagate_down, const vector*>& bottom); int count_; + cl_kernel gpu_add_kernel; }; /** diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 65a7e9f2..67d5e0b2 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -411,6 +411,10 @@ class PowerLayer : public NeuronLayer { Dtype shift_; /// @brief Result of @f$ \alpha \gamma @f$ Dtype diff_scale_; + + protected: + void ocl_setup(); + cl_kernel memset_kernel, scalar_kernel, div_kernel, mul_kernel, powx_kernel; }; /** diff 
--git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index c2ced487..8f2767f6 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -58,6 +58,10 @@ class Solver { int current_step_; shared_ptr > net_; vector > > test_nets_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(Solver); }; @@ -93,6 +97,10 @@ class SGDSolver : public Solver { // of gradients/updates and is not needed in snapshots vector > > history_, update_, temp_; + void ocl_setup(); + protected: + cl_kernel scalar_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN(SGDSolver); }; @@ -107,6 +115,10 @@ class NesterovSolver : public SGDSolver { protected: virtual void ComputeUpdateValue(int param_id, Dtype rate); + void ocl_setup(); + protected: + cl_kernel scalar_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN(NesterovSolver); }; @@ -125,6 +137,9 @@ class AdaGradSolver : public SGDSolver { << "Momentum cannot be used with AdaGrad."; } + void ocl_setup(); + protected: + cl_kernel scalar_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(AdaGradSolver); }; diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 2cbbf1f0..be1dd09f 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -97,15 +97,9 @@ void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); -template -void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); - template void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); -template -void caffe_gpu_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); - template void caffe_scal(const int N, const Dtype alpha, Dtype *X); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 49afbffe..0390ee3f 100644 --- 
a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -96,7 +96,30 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t const Dtype* label, Dtype* bottom_diff, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts); +} -} // namespace caffe +template +void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y); + +template +void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data); + +template +void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale); + +template +void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in, + const Dtype* const scale, const Dtype negative_beta, Dtype* const out); +template +void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff); + // namespace caffe #endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 75701710..b46130e8 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -392,10 +392,12 @@ class LRNLayer : public Layer { shared_ptr > product_layer_; Blob product_input_; vector*> product_bottom_vec_; + + cl_kernel LFSkernel, LCDkernel, LCOkernel; }; -/** +/*n * @brief Pools the input image by taking the max, average, etc. within regions. * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 00278db7..0d8328c8 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -966,7 +966,7 @@ template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(co template -__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ +__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index += tmp){ @@ -978,17 +978,17 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int wstart = pw * stride_w - pad_w; const int hend = min(hstart + kernel_h, height); const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - T maxval = -FLT_MAX; - int maxidx = -1; - bottom_slice = + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_data = bottom_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { + if (bottom_data[h * width + w] > maxval) { maxidx = h * width + w; - maxval = bottom_slice[maxidx]; + maxval = bottom_data[maxidx]; } } } @@ -999,11 +999,9 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const 
top_mask[index] = maxidx; } } - } template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); -template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); - +template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); template __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){ @@ -1013,32 +1011,31 @@ __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * 
stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - T aveval = 0; - bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } + int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + T aveval = 0; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_data[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } + } template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int 
pad_h, const int pad_w,__global double* top_data); template -__kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* idx_data, __global T* top_data){ +__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){ int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index+=tmp){ @@ -1051,11 +1048,11 @@ __kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const D const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); T cumsum = 0.; - bottom_slice = bottom_data + (n * channels + c) * height * width; + bottom_data = bottom_data + (n * channels + c) * height * width; // First pass: get sum for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; + cumsum += bottom_data[h * width + w]; } } const float thres = rand_idx[index] * cumsum; @@ -1063,25 +1060,25 @@ __kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const D cumsum = 0; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; + cumsum += bottom_data[h * width + w]; if (cumsum >= thres) { rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; + top_data[index] = bottom_data[h * width + w]; return; } } } } } -template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void AvePoolForward(const int nthreads, 
__global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); -template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); +template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); template -__kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ +__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const 
int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ int index = get_global_id(0); int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp){ - const int pw = index % pooled_width; + for(index; index < count; index+=tmp){ + const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; const int c = (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; @@ -1089,31 +1086,29 @@ __kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dt const int hend = min(hstart + kernel_h, height); const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems + // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = FLT_MIN; T cumsum = FLT_MIN; T cumvalues = 0.; - bottom_slice = - bottom_data + (n * channels + c) * height * width; + bottom_data = bottom_data + (n * channels + c) * height * width; // First pass: get sum for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + cumsum += bottom_data[h * width + w]; + cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; } } - top_data[index] = cumvalues / cumsum; - } + top_data[index] = cumvalues / cumsum; } } -template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); -template __attribute__((mangled_name(StoPoolForwardTestDouble))) __kernel void AvePoolForward(const int nthreads, __global double* 
bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); +template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); template -void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, +void MaxPoolBackward(const int nthreads, __global T* top_diff, + __global int* mask, __global T* top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { + const int pad_w, __global T* const bottom_diff) { int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total){ @@ -1131,22 +1126,22 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); T gradient = 0; const int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff_slice = top_diff + offset; + top_diff += offset; if (mask) { - const int* 
const mask_slice = mask + offset; + mask = mask + offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; + if (mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; } } } } else { - top_mask_slice = top_mask + offset; + top_mask = top_mask + offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; } } } @@ -1154,34 +1149,34 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, bottom_diff[index] = gradient; } } -template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* float bottom_diff); -template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* float bottom_diff); +template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const 
top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template -__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, T* const bottom_diff){ +__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){ int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total){ - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); - T gradient = 0; - top_diff += (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } + T gradient = 0; + top_diff += (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff[ph * pooled_width + pw] / pool_size; + } } bottom_diff[index] = gradient; } @@ -1190,52 +1185,53 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global 
float* bottom_diff); template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); -template +template void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, + __global Dtype* rand_idx, __global Dtype* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { + const int stride_w, __global Dtype* bottom_diff) { int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + rand_idx = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + top_diff = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff[ph * pooled_width + pw] * + (index == static_cast(rand_idx[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; + } } -template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel StoPoolBackward(const int nthreads, - const float* const rand_idx, const float* const top_diff, +template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const 
int stride_h, - const int stride_w, float* const bottom_diff); -template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel StoPoolBackward(const int nthreads, - const double* const rand_idx, const double* const top_diff, + const int stride_w, __global float* const bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward(const int nthreads, + __global double* rand_idx, __global double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, double* const bottom_diff); + const int stride_w, __global double* const bottom_diff); template __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ @@ -1448,6 +1444,15 @@ __kernel void add_scalar (const int n, const T alpha, __global T* y){ template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); +template +__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index] ; +} +template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); +template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); + template __kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ int index = get_global_id(0); @@ -1490,121 +1495,119 @@ template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void Dropo template 
__attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); template -__kernel void LRNFillScale(const int nthreads, __global const T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, __global T* scale) { +__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index += tmp) { // find out the local offset - int w = index % width; - int h = (index / width) % height; - int n = index / width / height; - int offset = (n * channels * height + h) * width + w; - int step = height * width; - in += offset; - scale += offset; + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + in = in + offset; + scale = scale + offset; int head = 0; - int pre_pad = (size - 1) / 2; - int post_pad = size - pre_pad - 1; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; T accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values - while (head < post_pad) { - accum_scale += in[head * step] * in[head * step]; - ++head; - } - // until we reach size, nothing needs to be subtracted - while (head < size) { + while (head < post_pad && head < channels) { accum_scale += in[head * step] * in[head * step]; - scale[(head - post_pad) * step] = 1. 
+ accum_scale * alpha_over_size; ++head; } // both add and subtract while (head < channels) { accum_scale += in[head * step] * in[head * step]; - accum_scale -= in[(head - size) * step] * in[(head - size) * step]; - scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } // subtract only while (head < channels + post_pad) { - accum_scale -= in[(head - size) * step] * in[(head - size) * step]; - scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } } } -template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global const float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, __global float* scale); -template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global const double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, __global double* scale); + +template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); template -__kernel void 
LRNComputeOutput(const int nthreads, __global const T* in, __global const T* scale, const T negative_beta, __global T* out) { +__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index += tmp) out[index] = in[index] * pow(scale[index], negative_beta); } -template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global const float* in, __global const float* scale, const float negative_beta, __global float* out); -template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global const double* in, __global const double* scale, const double negative_beta, __global double* out); +template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); template -__kernel void LRNComputeDiff(const int nthreads, __global const T* bottom_data, __global const T* top_data, __global const T* scale, __global const T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { +__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { int index = get_global_id(0); int tmp = get_global_size(0); for(index; index < nthreads; index += 
tmp) { - int w = index % width; - int h = (index / width) % height; - int n = index / width / height; - int offset = (n * channels * height + h) * width + w; - int step = height * width; + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; bottom_data += offset; top_data += offset; scale += offset; top_diff += offset; bottom_diff += offset; int head = 0; - int pre_pad = size - (size + 1) / 2; - int post_pad = size - pre_pad - 1; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; T accum_ratio = 0; // accumulate values - while (head < post_pad) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - ++head; - } - // until we reach size, nothing needs to be subtracted - while (head < size) { + while (head < post_pad && head < channels) { accum_ratio += top_diff[head * step] * top_data[head * step] / scale[head * step]; - bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; ++head; } // both add and subtract while (head < channels) { accum_ratio += top_diff[head * step] * top_data[head * step] / scale[head * step]; - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], 
negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; ++head; } // subtract only while (head < channels + post_pad) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; ++head; } - } +} } -template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global const float* bottom_data, __global const float* top_data, __global const float* scale, __global const float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); -template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global const double* bottom_data, __global const double* top_data, __global const double* scale, __global const double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); +template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float 
cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); template __kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu index dd90cac1..d93560a0 100644 --- a/src/caffe/layers/cufiles/inner_product_layer.cu +++ b/src/caffe/layers/cufiles/inner_product_layer.cu @@ -15,12 +15,12 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); + caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., + bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); + caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., + bias_multiplier_.gpu_data(),0, + this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); } } @@ -32,22 +32,23 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); + caffe_gpu_gemm_ex(CblasTrans, CblasNoTrans, N_, K_, M_, 
(Dtype)1., + top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.gpu_data(), (Dtype)1., - this->blobs_[1]->mutable_gpu_diff()); + caffe_gpu_gemvv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, + (size_t)0, N_, reinterpret_cast(bias_multiplier_->gpu_data()), + (size_t)0, (Dtype)0., 1, + this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); } if (propagate_down[0]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - bottom[0]->mutable_gpu_diff()); + caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., + bottom[0]->mutable_gpu_diff(), 0); } } diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 8edd6148..03dbbeb5 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -121,14 +121,45 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, template void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., + bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); + if (bias_term_) { + caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., + bias_multiplier_.gpu_data(),0, + this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); + } } template void InnerProductLayer::Backward_gpu(const vector*>& top, - const 
vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + const vector& propagate_down, + const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm_ex(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bias + caffe_gpu_gemvv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, + (size_t)0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), + (size_t)0, (Dtype)0., 1, + this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); + } + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bottom data + caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., + bottom[0]->mutable_gpu_diff(), 0); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 2dc18595..d2f1c247 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -3,6 +3,8 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/math_functions.hpp" namespace caffe { @@ -88,6 +90,9 @@ void LRNLayer::Reshape(const vector*>& bottom, product_layer_->Reshape(product_bottom_vec_, top); break; } + LFSkernel = clCreateKernel(amdDevice.Program,"LRNFillScalefloat",NULL); + LCDkernel = clCreateKernel(amdDevice.Program,"LRNComputeDifffloat",NULL); + LCOkernel = clCreateKernel(amdDevice.Program,"LRNComputeOutputfloat",NULL); } template @@ -248,29 +253,67 @@ void LRNLayer::WithinChannelBackward( } template -void 
LRNLayer::CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top){ - CrossChannelForward_cpu(bottom, top); +void LRNLayer::CrossChannelForward_gpu( + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(LFSkernel, + n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(LCOkernel, + n_threads, bottom_data, scale_data, -beta_, top_data); } template -void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - CrossChannelBackward_cpu(top, propagate_down, bottom); +void LRNLayer::CrossChannelBackward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(LCDkernel, + n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); } template void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } template void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } - #ifdef CPU_ONLY STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index bc14fffb..98be1278 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -4,6 +4,9 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" + namespace caffe { @@ -15,6 +18,17 @@ void PowerLayer::LayerSetUp(const vector*>& bottom, scale_ = this->layer_param_.power_param().scale(); shift_ = this->layer_param_.power_param().shift(); diff_scale_ = power_ * scale_; + //OpenCL related set up + ocl_setup(); +} + +template +void PowerLayer::ocl_setup(){ + memset_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); + scalar_kernel = clCreateKernel(amdDevice.Program, 
"add_scalar_float", NULL); + div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); + powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); + mul_kernel = clCreateKernel(amdDevice.Program, "element_mul_float", NULL); } // Compute y = (shift + scale * x)^power @@ -97,11 +111,74 @@ void PowerLayer::Backward_cpu(const vector*>& top, template void PowerLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); + ocl_memset(memset_kernel, top_data, value, count); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_gpu_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(scalar_kernel, count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(powx_kernel, count, top_data, power_, top_data); + } } template void PowerLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + ocl_memset(memset_kernel, bottom_diff, diff_scale_,count); + } else { + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, + Dtype(0), bottom_diff); + if (shift_ != Dtype(0)) { 
+ caffe_gpu_add_scalar(scalar_kernel, count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(div_kernel, count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); + } else { + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); + } + } + } + caffe_gpu_mul(mul_kernel, count, top_diff, bottom_diff, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 1894d0f1..af8a9123 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -21,6 +21,7 @@ void SplitLayer::Reshape(const vector*>& bottom, top[i]->ReshapeLike(*bottom[0]); CHECK_EQ(count_, top[i]->count()); } + gpu_add_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_add_float",NULL); } template @@ -52,13 +53,28 @@ void SplitLayer::Backward_cpu(const vector*>& top, template void SplitLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ - Forward_cpu(bottom, top); + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } } template void SplitLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { return; } + if (top.size() == 1) { + caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); + 
return; + } + caffe_gpu_add(gpu_add_kernel, count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. + for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } + } #ifdef CPU_ONLY diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 33bb5ed5..684c85cb 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -19,6 +19,21 @@ Solver::Solver(const SolverParameter& param) Init(param); } +template +void Solver::ocl_setup(){ + scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); + div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); + powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); +} + +/* +template +Solver::~Solver(){ + OCL_CHECK( clReleaseKernel(scalar_kernel) ); + OCL_CHECK( clReleaseKernel(div_kernel) ); + OCL_CHECK( clReleaseKernel(powx_kernel) ); +}*/ + template Solver::Solver(const string& param_file) : net_() { @@ -51,7 +66,6 @@ void Solver::Init(const SolverParameter& param) { LOG(INFO) << "Solver scaffolding done."; iter_ = 0; current_step_ = 0; - } template @@ -749,7 +763,7 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { case Caffe::GPU: { #ifndef CPU_ONLY // compute square of gradient in update - caffe_gpu_powx(net_params[param_id]->count(), + caffe_gpu_powx(powx_kernel, net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); @@ -760,14 +774,14 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->history_[param_id]->mutable_gpu_data()); // prepare update - caffe_gpu_powx(net_params[param_id]->count(), + caffe_gpu_powx(powx_kernel, net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); - 
caffe_gpu_add_scalar(net_params[param_id]->count(), + caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); - caffe_gpu_div(net_params[param_id]->count(), + caffe_gpu_div(div_kernel, net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 11ccbcc2..7a0e57bf 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -602,6 +602,7 @@ template void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { } +/* template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { } @@ -609,6 +610,7 @@ void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { } +*/ template void mul_kernel(const int n, const Dtype* a, diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index b47a0a91..a1be91e2 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -485,6 +485,7 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); +<<<<<<< HEAD template void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* 
top_data, Dtype negative_slope){ cl_int ret; @@ -510,66 +511,138 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} template void Relu_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); template void Relu_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); - template -void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ +void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { + + int num_kernels = channels * height * width * optnum; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operatiors) + cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); OCL_CHECK(ret); - size_t Global_Work_Size[] = {N}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void caffe_gpu_sign(cl_kernel Kernel,const int N, const float* X, float* Y ); -template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double* X, double* Y ); +template void opttrans(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels, + const int height, const int width, float* data_opt, const int opt_offset, const int optnum); +template void opttrans(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels, + const int height, const int width, double* data_opt, const int opt_offset, const int optnum); template -void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, 
sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale){ + cl_int ret; + ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in); + ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num); + ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size); + ret|=clSetKernelArg(LFSkernel,7,sizeof(cl_float),(void*)&alpha_over_size); + ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k); + ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[]={nthreads}; + size_t uiLocal_Work_Size[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) ); } - -template void caffe_gpu_div (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); -template void caffe_gpu_div (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); +template void LRNFillScale(cl_kernel kernel, const int nthreads, const float* const in, + const int num, const int channels, const int height, + const int width, const int size, const float alpha_over_size, 
+ const float k, float* const scale); +template void LRNFillScale(cl_kernel kernel, const int nthreads, const double* const in, + const int num, const int channels, const int height, + const int width, const int size, const double alpha_over_size, + const double k, double* const scale); template -void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const Dtype alpha, Dtype* y){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in, + const Dtype* const scale, const Dtype negative_beta, Dtype* const out){ + cl_int ret; + ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); + ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale); + ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta); + ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[]={nthreads}; + size_t uiLocal_Work_Size2[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); } +template void LRNComputeOutput(cl_kernel kernel, const int nthreads, const float* const in, + const float* const scale, const float negative_beta, float* const out); +template void LRNComputeOutput(cl_kernel kernel, const int nthreads, const double* const in, + const double* const scale, const double negative_beta, double* const out); -template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const float alpha, float* y); -template void 
caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const double alpha, double* y); +template +void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff){ + cl_int ret; + ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data); + ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data); + ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale); + ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff); + ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num); + ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size); + ret|=clSetKernelArg(LCDkernel,10,sizeof(cl_float),(void*)&negative_beta); + ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio); + ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[]={nthreads}; + size_t uiLocal_Work_Size[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) ); +} +template void LRNComputeDiff(cl_kernel kernel, const int nthreads, + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const float negative_beta, + const float cache_ratio, float* const bottom_diff); +template void LRNComputeDiff(cl_kernel kernel, const int 
nthreads, + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const double negative_beta, + const double cache_ratio, double* const bottom_diff); template -void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ +void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); OCL_CHECK(ret); size_t Global_Work_Size[] = {n}; @@ -577,122 +650,23 @@ void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_mul (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); -template void caffe_gpu_mul (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); +template void caffe_gpu_add (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y); +template void caffe_gpu_add (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y); template -void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){ +void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), 
(void*)&alpha); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); size_t Global_Work_Size[] = {n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_powx (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y); -template void caffe_gpu_powx (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y); - -template -void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) -{ - cl_int ret; - ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); - ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); - ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); - ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); - ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void Dropout_fp_gpu(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); -template void Dropout_fp_gpu(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); - -template -void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) -{ - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); - ret 
|= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); - ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); - ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); - ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void Dropout_bp_gpu(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); -template void Dropout_bp_gpu(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); - -typedef unsigned int uint32_t; -struct array4x32 { uint32_t v[4]; }; -template -void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){ - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float), (void*)&inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float), (void*)&sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float), (void*)&threshold); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&nrounds); - ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*)&size); - OCL_CHECK(ret); - - size_t globalws[1] = {size}; - size_t localws[1] = {256}; - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); -} -template void 
caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold); -template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold); - - -template -void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { - - int num_kernels = channels * height * width * optnum; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - // NOLINT_NEXT_LINE(whitespace/operatiors) - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} - -template void opttrans(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels, - const int height, const int width, float* data_opt, const int opt_offset, const int optnum); -template void opttrans(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels, - const int height, const int width, double* data_opt, const int opt_offset, const int optnum); - +template void caffe_gpu_add_scalar (cl_kernel Kernel, const int 
n, const float alpha, float* top_data); +template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const double alpha, double* top_data); } // namespace caffe From 5eeeb291ab9538763bfef9c2f5b067913c24ec69 Mon Sep 17 00:00:00 2001 From: Junli Date: Sun, 26 Jul 2015 08:36:07 +0800 Subject: [PATCH 012/124] This patch has conv_org,relu, pooling,fc OpenCL porting and correct --- .../imagenet/train_alexnet_without_dropout.sh | 2 +- .../train_alexnet_without_dropout_cpu.sh | 2 +- include/caffe/solver.hpp | 8 +- include/caffe/util/math_functions.hpp | 8 +- include/caffe/util/ocl_wrapper.hpp | 17 ++- src/caffe/OCL_kernel.cl | 2 +- src/caffe/layers/power_layer.cpp | 2 +- src/caffe/solver.cpp | 10 +- src/caffe/util/math_functions.cpp | 12 -- src/caffe/util/ocl_wrapper.cpp | 143 +++++++++++++++++- 10 files changed, 165 insertions(+), 41 deletions(-) diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh index 5f3d3326..667543bf 100755 --- a/examples/imagenet/train_alexnet_without_dropout.sh +++ b/examples/imagenet/train_alexnet_without_dropout.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh -GLOG_logtostderr=1 ./build/tools/caffe train \ +GLOG_logtostderr=0 ./build/tools/caffe train \ --solver=models/bvlc_alexnet/solver_without_dropout.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh index 15625f8a..12d43fc3 100755 --- a/examples/imagenet/train_alexnet_without_dropout_cpu.sh +++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh -GLOG_logtostderr=1 ./build/tools/caffe train \ +GLOG_logtostderr=0 ./build/tools/caffe train \ --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 8f2767f6..a5384a15 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -61,7 +61,7 @@ class Solver { void ocl_setup(); 
protected: - cl_kernel scalar_kernel, div_kernel, powx_kernel; + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(Solver); }; @@ -99,7 +99,7 @@ class SGDSolver : public Solver { void ocl_setup(); protected: - cl_kernel scalar_kernel, div_kernel, powx_kernel; + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(SGDSolver); }; @@ -117,7 +117,7 @@ class NesterovSolver : public SGDSolver { void ocl_setup(); protected: - cl_kernel scalar_kernel, div_kernel, powx_kernel; + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(NesterovSolver); }; @@ -139,7 +139,7 @@ class AdaGradSolver : public SGDSolver { void ocl_setup(); protected: - cl_kernel scalar_kernel, div_kernel, powx_kernel; + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; DISABLE_COPY_AND_ASSIGN(AdaGradSolver); }; diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index be1dd09f..c2720cf5 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -97,6 +97,9 @@ void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + template void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); @@ -124,9 +127,6 @@ void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template -void caffe_gpu_mul(cl_kernel Kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y); - template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -144,8 +144,6 @@ void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -template -void 
caffe_gpu_powx(cl_kernel Kernel, const int n, const Dtype* a, const Dtype b, Dtype* y); unsigned int caffe_rng_rand(); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 0390ee3f..9f2cd851 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -5,6 +5,10 @@ namespace caffe { +typedef unsigned int uint32_t; +template +void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); + template void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); @@ -96,7 +100,6 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t const Dtype* label, Dtype* bottom_diff, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts); -} template void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y); @@ -111,8 +114,8 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in const Dtype k, Dtype* const scale); template -void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out); +void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, + Dtype* scale, Dtype negative_beta, Dtype* out); template void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, @@ -121,5 +124,11 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, const int num, const int channels, const int height, const int width, const int size, const Dtype negative_beta, const Dtype cache_ratio, Dtype* const bottom_diff); - // namespace caffe +template +void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y); + +template +void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const 
Dtype* b, Dtype* y); +} #endif // CAFFE_UTIL_OCL_UTIL_HPP_ + // namespace caffe diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 0d8328c8..9a299ced 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -1103,7 +1103,7 @@ template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void St template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); template -void MaxPoolBackward(const int nthreads, __global T* top_diff, +__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, __global int* mask, __global T* top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 98be1278..94393f73 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -168,7 +168,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, caffe_gpu_scal(count, scale_, bottom_diff); } if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff); + caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff); } const Dtype* top_data = top[0]->gpu_data(); caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 684c85cb..2d4b1da9 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -10,7 +10,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" - +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template @@ -22,6 +22,7 @@ 
Solver::Solver(const SolverParameter& param) template void Solver::ocl_setup(){ scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); + add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } @@ -52,6 +53,7 @@ void Solver::Init(const SolverParameter& param) { //#ifndef CPU_ONLY //AMD device related initialization amdDevice.Init(); + ocl_setup(); // cl_int err = clblasSetup(); //#else // NO_GPU; @@ -768,7 +770,7 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->update_[param_id]->mutable_gpu_data()); // update history - caffe_gpu_add(net_params[param_id]->count(), + caffe_gpu_add(add_kernel, net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->history_[param_id]->mutable_gpu_data()); @@ -778,8 +780,8 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); - caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_gpu_data()); + caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(), + delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(div_kernel, net_params[param_id]->count(), net_params[param_id]->gpu_diff(), diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 7a0e57bf..85af49d1 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -602,7 +602,6 @@ template void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { } -/* template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { } @@ -610,7 +609,6 @@ void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { template <> void caffe_gpu_add_scalar(const int 
N, const double alpha, double* Y) { } -*/ template void mul_kernel(const int n, const Dtype* a, @@ -659,16 +657,6 @@ void powx_kernel(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { } -template <> -void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { -} - -template <> -void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { -} - void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index a1be91e2..501794dc 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -7,7 +7,35 @@ #include #include "caffe/common.hpp" #include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { +typedef unsigned int uint32_t; +struct array4x32 { uint32_t v[4]; }; +template +void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){ + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float), (void*)&threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, 
const unsigned int n, float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold); + template void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ @@ -485,7 +513,6 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); -<<<<<<< HEAD template void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ cl_int ret; @@ -512,7 +539,7 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {count}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -580,8 +607,8 @@ template void LRNFillScale(cl_kernel kernel, const int nthreads, const d const double k, double* const scale); template -void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out){ +void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const 
Dtype* in, + Dtype* scale, Dtype negative_beta, Dtype* out){ cl_int ret; ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); @@ -593,10 +620,10 @@ void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* cons size_t uiLocal_Work_Size2[]={256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); } -template void LRNComputeOutput(cl_kernel kernel, const int nthreads, const float* const in, - const float* const scale, const float negative_beta, float* const out); -template void LRNComputeOutput(cl_kernel kernel, const int nthreads, const double* const in, - const double* const scale, const double negative_beta, double* const out); +template void LRNComputeOutput(cl_kernel kernel, int nthreads, const float* in, + float* scale, float negative_beta, float* out); +template void LRNComputeOutput(cl_kernel kernel, int nthreads, const double* in, + double* scale, double negative_beta, double* out); template void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, @@ -653,6 +680,37 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* template void caffe_gpu_add (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y); template void caffe_gpu_add (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y); +template +void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {N}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void 
caffe_gpu_sign(cl_kernel Kernel,const int N, const float* X, float* Y ); +template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double* X, double* Y ); + +template +void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); +template void caffe_gpu_div (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); + template void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){ cl_int ret; @@ -668,5 +726,74 @@ void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtyp template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const float alpha, float* top_data); template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const double alpha, double* top_data); +template +void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template 
void caffe_gpu_mul (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); +template void caffe_gpu_mul (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); + +template +void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){ + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y); +template void caffe_gpu_powx (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y); + +template +void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) +{ + cl_int ret; + ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); + ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); + ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); + ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); + ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void Dropout_fp_gpu(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); +template void Dropout_fp_gpu(cl_kernel kernel, const int count, const double* bottom_data, const int* 
MaskMem, const double scale_, double* top_data); + +template +void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) +{ + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); + ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); + ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); + ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Dropout_bp_gpu(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); +template void Dropout_bp_gpu(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); } // namespace caffe From b72ad4d06b1d32ea053084be1e6fc590cf5404e0 Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 31 Jul 2015 11:20:53 +0800 Subject: [PATCH 013/124] Port the softmax layer --- examples/imagenet/train_alexnet.sh | 2 +- include/caffe/common.hpp | 18 ++++ include/caffe/common_layers.hpp | 10 ++- include/caffe/loss_layers.hpp | 1 + include/caffe/solver.hpp | 1 + include/caffe/util/ocl_wrapper.hpp | 24 ++++++ src/caffe/.OCL_kernel.cl.swo | Bin 0 -> 98304 bytes src/caffe/OCL_kernel.cl | 110 +++++++++++++++++++++++++ src/caffe/layers/base_data_layer.cpp | 2 +- src/caffe/layers/conv_layer.cpp | 7 +- src/caffe/layers/dropout_layer.cpp | 33 ++++---- src/caffe/layers/pooling_layer.cpp | 2 +- src/caffe/layers/softmax_layer.cpp | 75 +++++++++++++++-- 
src/caffe/solver.cpp | 9 +- src/caffe/util/ocl_wrapper.cpp | 119 +++++++++++++++++++++++++++ tools/caffe.cpp | 3 +- 16 files changed, 384 insertions(+), 32 deletions(-) create mode 100644 src/caffe/.OCL_kernel.cl.swo diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh index e62279e2..58e5229f 100755 --- a/examples/imagenet/train_alexnet.sh +++ b/examples/imagenet/train_alexnet.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh -GLOG_logtostderr=1 ./build/tools/caffe train \ +GLOG_logtostderr=0 ./build/tools/caffe train \ --solver=models/bvlc_alexnet/solver.prototxt diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index debc73a3..07d26556 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -69,6 +69,24 @@ private:\ // A simple macro to mark codes that are not implemented, so that when the code // is executed we will see a fatal log. #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" +//OpenCL: various of defines to choose the design schemes +/* ifdef: use CPU random generator in dropout layer + ifndef: use GPU randome generator*/ +//#define use_cpu_generator_dropout + +//#define print_memory_trace + +//the following are macro defines for optimization schmes in conv layer +/*ifdef: use proposed img_packing scheme; + ifndef: use proposed packing im2col + sgemm scheme*/ +//#define use_packing_scheme 1 +/* global_packing_N defines packing number of the use_packing scheme + for intial design, we use the same packing number for all conv layers*/ +//#define global_packing_N 16 +/*ifdef: use multi-command queues for groups in conv layer; + ifndef: use single commane queue for groups*/ +//#define multiQ +//#define check_gradient // OpenCL: various checks for different function calls. 
#define OCL_CHECK(condition) \ diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 4e884f21..a92bb4aa 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -482,7 +482,10 @@ template class SoftmaxLayer : public Layer { public: explicit SoftmaxLayer(const LayerParameter& param) - : Layer(param) {} + : Layer(param) { + ocl_setup(); + } + ~SoftmaxLayer(); virtual void Reshape(const vector*>& bottom, const vector*>& top); @@ -499,6 +502,7 @@ class SoftmaxLayer : public Layer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + virtual void ocl_setup(); int outer_num_; int inner_num_; @@ -507,6 +511,10 @@ class SoftmaxLayer : public Layer { Blob sum_multiplier_; /// scale is an intermediate Blob to hold temporary results. Blob scale_; + protected: + cl_kernel channel_max_kernel,channel_subtract_kernel,exp_kernel, channel_sum_kernel; + cl_kernel channel_div_kernel,channel_dot_kernel; + }; #ifdef USE_CUDNN diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 5aa02be1..d1408fd7 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -668,6 +668,7 @@ template class SoftmaxLayer; * -# @f$ (N \times C \times H \times W) @f$ * the predictions @f$ x @f$, a Blob with values in * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of +ss * the @f$ K = CHW @f$ classes. 
This layer maps these scores to a * probability distribution over classes using the softmax function * @f$ \hat{p}_{nk} = \exp(x_{nk}) / diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index a5384a15..79285a4a 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -51,6 +51,7 @@ class Solver { void Test(const int test_net_id = 0); virtual void SnapshotSolverState(SolverState* state) = 0; virtual void RestoreSolverState(const SolverState& state) = 0; + void DisplayOutputBlobs(const int net_id); SolverParameter param_; diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 9f2cd851..5e86b1e2 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -88,6 +88,30 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype template void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ); +template +void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* out); + +template +void kernel_channel_subtract(cl_kernel Kernel, const int count, + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data); + +template +void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out); + +template +void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum); + +template +void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data); + +template +void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot); + template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* prob_data, const Dtype* label, Dtype* 
loss, diff --git a/src/caffe/.OCL_kernel.cl.swo b/src/caffe/.OCL_kernel.cl.swo new file mode 100644 index 0000000000000000000000000000000000000000..62349bbdafdf55d217551523bd623664e7967190 GIT binary patch literal 98304 zcmeI534B~tz4!yDSQHci;UU5W1(P-<&C&(h0Hw5umKLF`v5wPZk_=5UAv5Vh1)mBo z$mWLno(pdHM8O4pATCc)d?<>jsJI}DiVKMPN7N@E|L^bY_s+eOnWSwHl25;#nS0MY z_nhC^e!t&2_uO^!PC4%QXyKteIXI;JH<*#aL+B`mxFUj!<{+1qp z%-j=~pR|1GiOXJGE9ki@!nWRge}7@_aA9nqF!#*yk)fhFvU#2U(pDbpHQ!b*TbWy5 z7#k@J9ojooE9#kZGZL7Qz=RUm_2kZu_IVQQ1qU4%?eUzGC%j@Nz>EZDBrqd^841it zU`7Hn5}1*|j09#R@WhcoW%RjCuO>TtnJisoK6glb?r45rXg)7aoPSE<{Jtgv8_!b{ z=dVbdf0{Xesrl?MH?-I9Y>vOoeBO{a|8#SFiTT`PC;i1O+^ zCeA-2asI}{`DdE*3(e<06X$m|$B#Cj^Nql4`gSwN+s)_3#QA5L<42j#|4N)Uw`=M# z=ac1wYe)Uk0~FpUL|7Yjb{)xjtF{ew8?%tbe~W=dFB__3s{YeyNE+S^w@f=aqAS zFvO`Jb8d z)&cqaME&@wIdAJzf8xA}cIIP70y7erk-&@uW+X5pff)(RNMJ?+PiP6``tPID0t?_E_y>7Kd>%ds7eha&y5m~OtXZcWx}m3YZS;!nNWRTmyST|6>yAz=RaWLYJDPQ%*62ij zZ0gFj=jOH8aP!tibM&>Ha_u=$u|H~FHC7rQ=_^M^L>=>6qAfm3bDNI3ZZj{}?rzf& zzl}x4WFIF!G z-`Uq{Q5~ttPA%j1MU$6ydup;LFYN}#sP=_E*^Nqj8po*i1wPrnMRirRDBpZsi}GT1 zEy`C0*P_(mR@I_{0o_+6e~(&KX&T3<_8{4fmhm)>QSE^q(x|kjag1u8f<`cnV^n*f z5%?C>S=FL^^KmW8Hy_ucyl&`PRC;G04CsxP@pO(+L9!cdQPVj_1$s!M(w@dKO6f6T z|2Jd1eM%bA+@nhKc_ru%aEI1vO!G7=nHvRYDoAAHz4QPXRV86c|UISv!?*$KI ztKSCShws4&@NMk!o8dY*9Tvc|;kVe}pMiHk1}s^;au1Z1JD7_hR3j>Z-(pP^)L=gp&kB--TVN20X`0!U;w&ccX$Tu0#ApX z;V5kA3U+ZBUJ3ibZ?T8J0b(D&23`rr!v63hWPB@J3)jG>LFC>{y7q+oNyF_RK7dcc zd*Ds50eazhXga4Uii9XCjaPb#BYlNUVhauDH*LrdEmkM=>Bt<@C!727m3&Y0NHja@ z9n6pLL`6B86ZN+6yF8k&6!Sy5zTz;4${12QGSqsxO@C=DYA%jcqF!Q}x0sJ5E{es` zoH@j?MFm{rZ#Z|w$~C#;m!76$kc-P)tP&d?9Ny3e7;ypGQUdgo0&>c&N5+TaX|}hQ zyXZjO(LkY+8yG6BBUi<~X7MaYPI^ltWeG!Se3o0y;We`(5m9Tz7bGjO=FGDv(S}m7 zFVfGUoO~txc7}@aRy!QizuH4&}?`XblS>+_79PU#W~0*Ns;S zxm?uTJe(gH7%KGTM)Jdj=5ndO!cB60rSWw`g_f3}YhXv@WMJ zQ*~~rR4%s?la`LVWGKI`Fx0AKqp~8Ipgu^*p;B*t$ORlLl*flEC-siyE4_orwXwYP z50&zjda|Mdrsu?*N@T$y8PH!u`t%WY~@RHD_1XDx-xhC^5b(Wmakg2-1H<3lw5m9 
zmDD(-v#jzx@!d?Ntg2heV>oxHG|=o?d{2?i(RAb$y7k^%Z@yevLUTT%S+((=;@XzA zzGqh*dy#rh&0HK6b^j_~sLSQz*#%QmQ*O;zaOgZ;qja^BZj~z}F5mmg5!e4pry?yN z79!}M6CJMU;C%l^&+IZ9^kE!MbOv%{qK<$tv0RZ;*OyJcuP$zwsJ~{I-4JDu9H!Gv zbkAkF;6jjP+tW)Y-*6{P0&=Pw%hr^-VI_ zGh+G&2_O|F)C&-fu5oe_y+kVoR9CMX%$6|b0ChO3lq&flQ;Ths>R7xM#_FjJS6iEq zP;G6aV$tjH>-37NkK|o z6s4(NeL&KXQ3Y!zA*fVA3d~WRfS{(u6lCJnI{mT_!@iy-UXxZa(FR_|3Q2Y58-FH z8$J&ofww>rPKG03HtYxwVAKB;J_GNAH-OCl=V28r2C@BjhexsPW&Z!Ga21>nXTce; z0``Yp;IH@{Zh`CJLvRU*kK!bdHPw5=PH;bkdOLgy-UDxhSHen=wcj%T|15X}pT+Os zhwx>%5?%`Q|@Cc9C8^EA%jbWUMT zbQ))HI;SuvI*l_povCx8hgG#G-+Ww)^3BK1;P^=$*P>GAL_34Y#YVf$X`I0clig^G zn#LKN&MC}^PU8$t=M?5dr*V+NS?3n;`9W$?Rg3b?$F(Tmd|ZpFcTTi37||PTQPVj_ z1<7u-MNQ`z6(}N&N_!f|sE$C9X;j+NI7Z=6PH9mKt6G$AKCVUi=HptFU)bfwC^fhx zRr`)$KyS2+r*Vu5lig?;PvaQX5$GX}N;`Xus>l9+54L^Lp3nrhW8Z%ru7Y>NS+Ew4g9Bk#_$#*kJ#al- z0dIvf;S@L$#Qy&`cp5y2ZGR`+1fPWqVH5Ph5@?3qLDv8O5bXN@^Z738|IYwf|Gz)j z_5ZhE-^=>{4}q-zAAplU*8lGfJHh?f^*@1cz%?Li5-x;s5Ffx{a4?Ad|2u4ZvH!me zSHf#y1jPRD0I~mfh6l0jZ-#3@?Eftw_Wud61ong`5c~h@Aol;eLDv75L3{wm!(kvk zfZafz0r)Jjq2O{ zpd<3?Tl=Vxtn_iy(zZJv>a3x#@;saO@%RrtB5c~f@?0T{PZ-UQ)JpaE5 z#Qt9bV*l?Bk7C>Z5WWgx|DO+H|DORXV1L*J#Qwhp#Qy&f$o#)N|1Um(Q(+m14`3gV zH2^;cvHxZMU+n*J$irc9Fo^vx&;E=3|7ExmUJD~2_P;#)FZTb=@F2GR&2TM<{V&h{ zi~WBBbc4+Qix1!_@F#40@d4Zj?}ayk*#F0a*#8HC>;Z5uw*B`(?ElL^?Em#}Dzw4# zKPXK5I?|U z*!X{fpMmTzcQt$*WIq7$2@Jx?&lc*WJ8UwFKkvD3SfPk@AL=mJAIA&PJ?5WY2T*c z>exNxa&96Yennv{gh_?mo3F$s2a1a$pB%L=DUwsodZ80*icV(9n2*pLm^;@W8kaR? 
z<~Ua}G4HY`YY-*lBwG^i7txv}vcW1w6$=aXwTxU0`mo+8q17E-qr_VtEDiOA2g~E5 z;m_hozyFyc36*^x!mSFFdv-;$bxKImy24m%V)5e%C$2g+7e?8fy43cZsEKvemGQBW zhzCbz$xr1>3Wl0(Jx8=~3I@?MYHwk(8y#wp6ds~oh5N+#8 zq6Jehh^A3{3zOYwliGgT8=Yi)Z=3~FFo>qn$FN`u2GKP77%eXvKwtu(>O{k2z;}RnxE4-N(HJ+qtc$vQ7X_00>y7zZ|XBY7}Faq zVqf0~*T8vj4u~y&Hptw251awVf;>Av7iPgd*vQ`ndB$JX&##B0U^kF`%Wr_quow=3 zr-RIqe*><7ewYWl!OyXGZ-KAFm*I=>1yJ%t1oiy6=MH1fgC`ybFAjI~mWDE(sL?0G zCw+oPje_TAa`H&Ge`MZUd*;lP zovp#-I%oy-1hzft)Y+2k)`9dRajuWhVN?~6=WV;IJyh{0B5QU^<}tNIhg5Vo3wsGI zpzw&wad==`c1SjW2OF={-0M)6}n)xy|~^ z5{|Xl$1-Iz4g91_qE)tgl8Tw-I*nJ%x)M*RhY9EB=^5qhGvL43zH!|+#XDwl6|qCl zcww=eYq9}cK4e6-S>a^;xsi7y4;O|R#5?>}30yuq(l-z5tV%$5z}(f>-&!h2sdtDQ zW$j1f)8Uh0Yh*kn`lyyhRbsLpAx~6OotBK+nU)k&^WrH{2Qw2w4V}fmS!snht7%rU z=w?l(%)G3qVe!PR$w*jwZ`0J)Rv77vO8t2I3v7s?|3(HP>07jSJ~vPpDX?)58#?LD zjL@N6$5pG~@!ucKTewT(Lqo-5GPWbCcJe!oE^Ue+ZOTVaF_^7}XaW7L_Pfkg*zYot zpeHk;*nEfiWhO;NhNX*XICm@3Hz?YAnbS2+%#%hFMFshzxAqPA!ZRaSt`H}vph5fi) zvX6|~qr+uN%SLQJ)eX+hx?E-5QJ`)LY< zB!n|*xf7d5%zU3rP`0WWO*=+%>6RvYz4(dM_^7r})3HS@O=ADcJMWeo``^BwO!oA< z5k3PKW7A&*>)}jT0TCPkk6`0}3*HE)!1G{VcnDkmR#*?ug}vZDZ2B+3f51jK7H-Fm z7aRWwm&d|?~UF5T-XbCguh^ue;+;p@@)G!^ubct1Mb5%zaBQg zQSb;h`Hdj^^1Tdpft$(mn?U8g9z#7VjFqw5C}N{BGsQj=EwNn5k5v}yA80K4Af2`B zo~M3@7g@*QVi+_rs81V%K8TH|qBSw7A3g>dgnCPb{{EazL~}cBR@*|(DpRo=8&$r$ zD&sHLH>1YKE;rVXG49l**4C9Dw(Uza+Or^Srk5c0Lj8$F=Z;~?N$29tptK0v+M>c) z<3$vhg4m#Js#h518(UnJMvUy_o=!5{cDOi#mCbH~${9h2)!rK1B$i`+nJOnquh`)# zoh{Lk*yxALk-7Fr7-5)QSFGmN8xyy7R$A=N`YLzlja7HvSo_Y%rMEOZ$_|2R+}9DY zav}`1N*VEC)}^3&DJng&dfPT1AVbXRI#ZS7RUu8v z|GIABQ1o8NMpyQFvj3(>UGEg@(Sw#69!0~&&n4ewHCaNh71yxoQf(x4MPq8Zy^;2@ zsrwX%6P{wa$jvy&oO@XP2tCA`!n5i}s>j&%hALZh>Faah=(|3r#$PQjHxQ_>frdbT zzgZlv;o!`%O>rlzh{E4a_2nwn-W@KrnnB+)7~_N-2A5GmD)B-ksoWG5~In~GZ6lH5dzNu;GCHagH;KV}=LizD2YYdnw> zg1)64+l?Zisz+O(PoeEt$@r{>x$}8d($RXL(F?1uAWTyCsJdkf%1HG+b#BrwmhfI~ z{+?P$HiHL;K6fPqs{xQA_qAH*Fb&%V(hb}Bv&}kBk$Sdbl2|u6VYi+>x?7Qio#Eqh z;IzqaWXw!=el;+R)>c(A#?ROgA0I#;my(U8H&eW#fQbEnD0b};#{Ta%HtwgH>;EXc 
zA6^Hq23ZFn&-4EZ`~H`3EBq6?{^ua;@ZSk#koEVS@O1bQHvXqT-n+LHc7g}6^KXT# z;bM?A_J_jXvGsohvZnqW@Mbt4UI(v*5&QtM9{vv6DZ z(AGs>%Rf*x=ITqHM6Loccr(U{hDNkH%+|D_d|R5bHvRPsai;6pN&|9k^LTzF)3TM5 zF_F8urPxRpPj(&h-IlV9rHnqcL33Z3?30yBw!J^iLJ1RQWHNzuo^7kkW28DedKhsI zR@+TyV)jSWFj`WQsW(D3Hx%&H;Z!&aTYojo1=)9B_T0M_J_xec{&DbB_%3$-n_&&K!(G_*?}jsA zAGi%0|9sd2WwH0+BzPILVDtYIo8CU(E^G4-hMnLJYp=FddlVaA=Kkf` z^o?*9ltATIXLY;%Uze8(wYtLUjB-ZXv8OY-qFOa?s{N4b@4}mqp0J7XGvziMHS=A% zw#Vu`Cwt^FSMlR5!76hy)VW|Z4_Q;7M;`B6@UsjuzoGBS7d^vm91r#f(+9LZ`BOew znRV9VGiu7nQL-H;Hr&$5A7a*OrPR0qTU zWF-}LE{}gfg{jqYdK`@Dx?2nU(sm_Su-Is3IT}_HV}qWCoLP=$#=)S5Fkm!4d6uJj zW80_!ERzl1Be<cic z%MctY4Bj3Ok0P-~7L|E9t1&FgO0Oa2aP0e6m2#Meq`sQ=Y_u2h6J(qh`+sjN;Jp*} z|38@Lm-qYq9b}LHv#|MB!5sJp_Wt+bU9bk81HZ=RzYflYP6) zwW2h(F+bK98oMS1@x}A&N|j1!I47QmaI9jW3V(VV-&gB8tm2pi!}!AD3Xhw59pB2r z)pdVwTam(R%`w`94=2?o7MZF?t`P$NFpv zW1-#jj=j14j8F1#UvveT>gkSx;_l%f2}?XMAVE1wP#F#bPq!rB{7K@%h+-kv@;I-) zoqwQEu4BZJ!O(0vqXu0ZNb)Qrx{i8`F%0iQ%lM}in8l<>o?OtO2gA@oVo)@iW#zmb z5BS~d9}Tow@?&LqX?K&!pi&H+Am*|=xjm8no^A=@lmo6!*UctY%zSI8L#+z*p`dT8 zN23a_RHfq7pxSy(Jvx(B`qGq6!j>Tib*L_`pVjxlzARR)>sgn4=U+MT$Pe3 zR`2~$k=9;up@QhhV#_kjF3#~K7stnj_S%Cs+C)&{Lp|h)txowKKhURwQ8{g-HX&6n zJQZ&g_P(0x%2K-Q`LN`Iq*;YQ&AcW33sdz&IW;qO7g7{f4SuHFhoi<Y_jOB|X$7JjblM*y+F4cCU ztA=?83MNyg3k5^q#mr9$qJVw(tn*CVyRXEp3@OP0^iGtN<|q zddG*$);r#PrTYBrGX8EDV)>EnQn~_LCQWXyHto1jzA<;#j#7 zWSGKI*$aw z;v;F;QVA}tZK}|Iv^zS}QrUJ+--d^aY*VFoXuBFK+itWVi?&+{6#l?MnTp3Z?q%2d zQ1uLLJXGxg{E4hn1#M^ohrE@79fIrCtm2YZgUi&4!)O#i{~a1_+qH>!>1`c-%Evd+ z^hlao<()@vWqhRmY>|2GC%>T}eo;MLsAfR*sG!5gwJZqelB-A!`D{;i2jwv~b<_WyoZy+;}Q|7c?Y--b;ud;f`jUx3BX z1hUt^ytnT}cs~4%x&N=h2+00_N5NmP|9=9XgxA5zuroY{PvHOH+wd(o6@S28*!OqB zx8XdHz51UCU&FQ^gC+29@E7d(uY%b6{|-6mgh!bB|6lkdtb_gFDR3<|{}{Xi_5^tb z;C{Fj-VZ}C7akG20WHNzAz)ICs3pIB;YE@x=Er~|0mwc3scD6L> zOZ){=Ek>=I90h_4tfPE1@RUmf5Pue--B%WJtL^)B#ZOF7`1d>(_=F&0tjX4 zdojQNLE6-W5tCV}Fx$@R2eb8SA}3^L()id_-JnQivqF|z+LadW>3?=KQX3bB#Syyy z{Wa7Z(}7dV%(jItLfJ&nOmzF51$5j=Roc|uNcS2+N(1$s{SfV<4xdsv#uke)iNuoX 
zF7ipGe!CqOO^iONoV1%Du&9H`tCDJs1J$N{QbTJ%+rOC`9{7u5`hWbY>LnijGHJbT zort>3Z!?rkZ>Zr1H3W07p9zCVwrbLwH}^8G>3XMfmL5ROG3ef8V&gzJ@Je2uk&NU| z=8Y;P4v_w=hes!v%8+QhAoSXYn6qA=v?Cc^i*0rODqAZ<^{27v{iZ}R^irbg19jhuv{aQ@tNSa_mB8*Tpy{6PDp`-J?>+4!#V|%bxzZPMN&fG#d zbM0-y5+vl4*X(fRJz1-wT9_G@MRma1Qh(xF7f$ABwu5v8sx`3Q%9{x|^Dq;}3uOiq zl~S~>z&6W#<~NksW3jKefwxD^UAKAe*@dxEG&)vV z$8OJMNnuu`en6>z;YD!GjQdi2==hss+DtfNW~^BCGR$7H8Z}ADrKl*qrE%Vtt)2JV z&3PFgvQ!e{{=1+Ngd_D6$#H2rNpwN6|BuGTJqUYO=KtG`z56BR`(@w1jUf8~6d(sn zU=b{Ue(e8uVb{yPf9F9bHvO~VKJ5DI;35zke}9nu{AJF6H9U-ce;vFBL#Pz~A9-Fo11;8)^Rmd=cITJ)rV4f$d)3X{o%v zn%D1|Ymy7ycy&x=^JpQ}Wo*uPHBR+`X2`Fl1y<>MDyAdqeCvsEY1$i1;A6JIYU>>_ zMF@32Y(Mo2E|9j?u7d+H<1&ms?5o1#PP-2xnYzspM3yvrtPE^7B34hG%Y+j}lA#PH zLe(yjP)&N07Dz%F*5#om@U`9-l$6&Rtqf?UBpnPx9f`)S?}@=W|jX zxY1YC^-NEb4QoG@b7WwAtl<17BHSWxzLJNc62ZOKReUjZxM}rW{AG1IYxQNePnSxO zs!hkbGb9xDV!GzpT&8&u?KwX>h2BK}{*j4%h{O{{VU=P1^tbaGI+x-^uEdmNg;gox z)2rbyjHk=``poNDbh<+Sp-WeW-_WK@DrN&7!tk<&e1zeJF}*tBB{a=2Wy||2f1z!$ zrf#I@)M=}i#ml!9-F|U*RYSv(ogROZ4n<^NH?&~LnaVN&vwGYrXIVO2B3NyW!Rjxz z?+T0~8jj6?%&Y`w>_s|va;XmByOkOBZz#_@nrgjeF;Pqz>R%#M?ZPy=k1rOP-wVnu zyF{z5e6kj3s`@DUV8hCzS0ep48;u<+8c`!d1wp1YZL@1=PFC!H%;=^Fn_2Atql{5} zJ@);#`={$KV6TnY2xH~0nQJ%F!;m%?-4 z2lxp79Y$dtEQAHn4sCEP{({YL6#N5!!S!%1JQZ%lS8yS$fTzJX@e|wxp96U>z(RN- z+=H*+F1QmGz+1@VZ}AoUM*IbEDlCHi;ft&TxE?+ae?$&S7Df(kfqbmx@}A7${K$Yj zvp$j^E;OH5Sb6e9y>J*NXthZXl{wd1~ieuxFS+G#BY3(MVx{H{@+7 zA1Xn0}Hm^Q{~md+kep_>wU_rPGO-QR13-+6d}3t=9tZS>zYKO z8`3Az3Vwxtv|rbt>Q!!%7ZaqeS{#fvK{a)6^NmZqp&DvZq3XwE)9I>VkW60#TceE1 z*Va~lzp&Mk*(KE@nNAK;6*3cQTZwHCvaN*uPD?n_|2*8Cm>n+@UK+d=s{h1QMpg76 zd%BXhGUXFbPeuio5!9=jezykKXl(7%fkF*jw{w4S!#+c_f$-*&^jaojsb6DO7gpsn zWp#2~1Wv>Ej^vtyT*4)t)j=eJEz#kQ2s#UEqv~Z7UhXM{X4>Ktz06EDQ;IAmfGL)k ziYwMrMrm^mOH{b@0%On>l~vU^>b&|3nv(5WSW;vb7Wy$%!iI`(+HVz~T~5UkLR?Xm zVsEZDU#=`+z&)bbN+?UmZ3!2g>DXLS>T=D_ZZcPykAA!JLA`4g^>zNTh8%`+u`T4R z8Tk5rhAyV=CY>t(zm?BkHvSek z8wOw=%!2P=>wg^H1_Q7Hc86Wz8L$h?#{NH+cqQ(Qa2hOy{b66&2lfVeU%*#L!+FpL 
zOCf^Cu>bFeufYW%dj$5wI*@$=bFdQjg$MBsTnqBvK=Bov2<`9)K7zl)U*RFR2i^yH zcquf)&Tub2gCE1yP=-a&0eix;VK>+no(cblKS7>FconRMeL>bF+zwxYx55DY8{97b z1$ZA^1Pfsg_&dIa{|66%JeP1EoDD7T2tJ7a1$hoZ{-mV!pHuRjTEGgLa%{|7$K+Qu zm@nsw10$ueLT)I(j_##Z*WuWqCL61scT@VNw1+;~s6xJ!M;U$dL%0-Jr0N4F6Cvxf zl8>;P^r@SByVV{omCM0u)tX9Yxzt}7&Tq;MaZRp&G^e77=`dP_(ZN;{%2*o5o^Uc& zJ~KUbe*xZKNhY)gqW_mHDRaK;^_F0}AfIFTwx-#vV&mqqE=*G9ub(?q8o)QSso6=T zM;;qMgbcO1EYwOljs@c7TzRdRapFM^m5^jYQ~5m#XBhE)bWR7FQRy*LQJ~l!sHTMVa9Z-n~3cV%O@+ zR%GlLwnYV86UjZC=BawLT9r=q7A^o!p7`!h8FeRl8PA=}p{tCz7g_6M4alq@`S4B3W*h#8Vkb*%vnivR4TYMzmB(2g&g* zinPUB6bGbPsA8$BLFvouZWP)sle=ushB5WR6oIHx48$mf8DET$C?&EPlv1*!lhd(D zXLwa)>(~mcx00rU_(oP!naPSV%axW9KFE`Kjf(TlmJ?tEVrtd<5 zu^xKUZnBmzPC;n0)PK9>&??Mf2F)?4S`16U;1_m^bA6|XRJK#67zJa^HtH2s5pU&Q zu{^GuNL2;(?IY+KWiYSVhud1I+rIt9bO)v7)R#sY9C>C~9_MM2Y6~J!6A2pRBCAy? zJ)>qGf;wgRq!GB09+2TufRC=dj;~aAN&>D z{T6r+oCO0=fIhef`+Y6E9FBuOVz=K4pNF@=C2%o3gw1{moD4@B`~64A^M8yD|3N;L ze6?sB{NL-^o2}k}f{-5?9n9;opJqgVVZG*TVA|3ohvcvJxEmik^L>4}G7lc-+3MF< zX(k-?54MevTgHu&%^8z%_0Es-q*rExl&hemAWIKc~P4P49g4Ej3o9k|RlDLev zHeuRKv#(B?IT5DJ9Fl~oF-RAkD{RWzmU~;Y&V0=*`G(ddOqdPEhw3bFFibu_v#LRT zowQWC%vvBPW7WW$Il4q`we(iJEon2ZH;{;o(yoyPbwza&Op%6~g)UX!m_r@I==;i7 zu@=KJ(YR6W^$r8pjZGWJZZN(LIs@j6LZ`NF_Y*$c-(dexLkg*VgUWpR-n;b`i7bE0 ztQT@Jqh3g-r+k>MzX)qYN|&HqD-CoDBrBt)G?c2Kk_3q|M{Syd9wDvni1$Ug7WItD z+6Z-3xiDw>ZA*|xi+1<9&STc(s+5j1=zEa#)~;p^iIBcWs#zdDj%d-P*Ssd(4CQ|{ zHpU#{_=`k0bWT8@Q5`yFIAu5mgBw&;zl00V-Yu9H(tdZPZQ5E~r=}b8ldoeVAE)Nh zF>2M&O!j*Zpc2kdl+1k-=%x0iP< z2L-4cx!!-IudLexMO&KeoJ5mJrB_c?lgjBCl}XM?_{eIyG`Y18S!+fjFx}nclW{w$ zR-;}sfb@b*cg(8+BzOKE&S#2tKo?)_IW@vaX^M1$rWv~J#w74m+M(@NeH-LRP+QV^ zuBmi9#I72TQxy`91D*M*E;gn2$>^w4`u`>!sr)aZigm+P!V3adr6O$#$M*IagMgmQ zak;JO)rj@=%!&HGK2zc=^rGIXo?bowk7@4K|MU2``rCxvFZ=!958nrw|6c>$@NC!( zHe>7G#l?5Rx8Y1U1`dU%!Bb!-I0KvhYHawc;1lp}I0p8Io#75_`cK0r;cD0iWDfrv zI1^;v{s-_CxB%V=vhM%sAo~P74Iac^KN)0CzzFWbUjGF=gnhmX%!0pSo4*Kl2HErP zI(R2+fR{i7$v>$KzDeA) zOH?NyHM-V|OH?E3lJ`UVoq&%bI1M*DOz35F~4p 
ztssaz*=u7p#z(HSp)jT%srDy$L@kySF<}ZK4%g0tUd0`YwY|iH9>JNf*bs|!{2U`>@|h}-5r!(r!JL>;x?3?>s)eiLfKx*t?k zm)&@^pSWT-Sf@?WYFFvn>320c>|!#~KT&mUS)8b;woZmmYi#rSSv$X~dQMzaS#>X? z+NJlo$vMT*(#Gt08#x(s1XU%{cC)w%bOD+Al=NrYa5!S4jP{V4vW#jpal%X;a_Q3b zB$+G|N~J33(CC)}7cn<$wr=89?LVYVph5V*AhZpMA(wD9rgU`@LnGbY#Q8=kaCuI3 z9cKBH>nxQ5yT!4mac+1#+pipiiPxS)-)k#H=wk|NQQQ}9L)Sma5Tql^sTc%_Cp9^W z`udc}ggFs6Qe|Y!kR;UjX=D{qY5AxuSkO%WUUJGQ)HF-Q+pUvDDE!nC2iO|fN;DaPu zWLW5STFBMi#(M{UpRBU?AyQMK%C;&YL5a)!|9p(zi^TS2{(rSGfWLzM|DSL^6hYnr z*akboec1l87r-Y#o()(A^Pvr%1AoCU@I`nZjKE^(gx}%|_$k~5{{xr7CU`OI1-rrx z_zFHN{sOoX{tG?|7vm>53%|f9tc5%93w#OQ3niEbhk~r@mv;nS3|l~a0`p-nxEFuG zjqq+LK^yD>GOzz*xB;#Nnb$uDR>6TF{)0c`Gx!Q@fwiz2j)PX%4Zex|J`PHbNhd#xw8?v*tyXWtkg3OL>)lu_dO~MYGI`?>h|W(_p+&KZj5#NrP$+cVOUHY z^<)?mC$Tu9wkC;x<6sdpNTcMH6#QikEAn<_V%`Ai(wrfcG$E#zci8$acrJmoVg zwF)P-_Qg2<(qBf^nI zZbh$ktlHF;9_=V5Iey2a7HmtAD1SnVw5H-_rl6gJy+-YJVPd*pb+2M;@wRrwn}(r$ z3f2C2b2e??I3?Kz8jH)gs-39~#RZu_Hl}`WYtiUNmLVB^GB#Ubl4S*xvV=h38ReWN5QGXfyFdmr`8fXn;sOq3kD)=(0+e?i95Auxd>< z8Mb66qq?iv(xD;s0wrC+^OdNFxyQp>Bc7_?xK>pQtKjPpPUZG0sY`-W+UrvAwPueE zP*bzFch#SX7n5manED+m{61mZNFf*d|ApAE`(WdW{lCE2syAWhp9>q{#qd1%3AX(O z@G95<{0;uD=-f668AsS} zop;?!;t!7BMx{$G2-2m(O}ua?vmBY;jaNmCnAGxhj?FY07q^yLWSWkgraG&SgH52I zwz^iN>%DEXwmAnOZNpG8#n;-Y4pOH>-zm7v-oH@E4Gfjm@q7_WY}=LpfgVIEDe%Y7r#E0E`5_mn`13}d>ne1w}XA$cBkPo-4et{Z)GEpsVwW5ptDu;Kh8Np zxL{Vt^b!&qfAY!|>$A*^AnC?zQkSa#Y%R~ey9sPlN>;POM%^Z~m4LpHEqixi%=aQs z21pf1KkKs_$Uq(#sY-b~Qo9f4_V&<7j%P~QP5$~yyp2j(?El@d^z!%s#Qwj?Sbeu* z?|&EG4ljr0a14kaU|)C?oBv^uy#YQ4Z-o-9gcf)j{0bky-EbFN1gFEPAbSM701kpb z;|ur-h;QH`*by$nH}G~i2l_#tA6O1cp&1T_J>W6c1l$kz!Y|=_@FlnsJ_hfE*MYoe zpag5-IFNS@%!e1iv)~bY5B~?hfgghGHz>~|ybt~p-T<=K;7eg4JPRJiC-H0e9()Ym z0k476VL7zH^WZ`J6?Yonh3rLe32cS}bVCd51%JSo@e}wOd=iJO}=X zZ{zoH1Bl<_U2q<}99|4v@Jx_*B+4EJx4;!3`x2fDr@+zB3j4sH@QwT&Zi36sTw()H;AY*6!k)=i=j7Jihj|L45OEeDhpv5{XR4@D3phx1qXpcNn(CMYP48P@W9mpvb*ij8 zCNJyu)KpJi*6pdOPL(z8HQUEGD(meyGLb}4zx~j}GQcL=rw8MU9#& zN}_VawHj3mhgeGqzl3Btn`CWj0u5+q2=JRM}73qt=bK^C;q%nHnrsY;fUZ#moC 
z!~%)f20pD4ZgEXH+8_#lgnV5klyZt}VaelHO!Jyf)TDCG;e%hMlt4=y@^{S|MD3>4Z%Z-ka#{TH=h$kc1>qb5Q|1<2* zk7KKf{Xf^(aDTwYzY^XI6?hpO4=;t6z&Eh(m%>pn7k-B=e-((GFM9zV2hFe-+>cFv z4ak0gmw?RwzX{$5V()(mTYdrj8C(2Ma0h%GjAbSBWgdIWb z>94>?;A~h8JHS`4i$4snhjBOn9>y-d3w{beg4^H+unu;IU-7)aC*eYnKke<4|J`h{ zag`}ErdHPIx@}ZmLo)#*K>WS!+{O(07vbMBHSu0n)3(m|w z|7=k%TClBje!6ih?5Shcf=hQe(WW{<(C~49V7nph!Vb38vFn9x^5r&eqA)h)Cw8jM zV9=_V+)mpcXFKi`LkiNG6NeK`Nsi3y^FOoCe`$neB~sC|9^~+Im%*MUQulqswmVAM z2~yQO{dWBJu(S@dO*R{-`>pe{rbm-qdAJerq!Q3KRLZBMVN!9Kp(vIibIk0{S>N~+ zW=dY}ki9o-b;w}$S9Lp9(5^W3BiMSu3_eFk?%_t_;u{ zOWI#S1!7E@RGpzQWj;`x$W84R7uY@ho8m2Ed*>xXMVnk!0;MewwSUYAiRhu+PK|Rw z{9CHI%)BVW^zwAwy4uG3a8Z=v>zXE4NUqukLyS8X^F(Bf358AByVaQ_5hSa>4Ek-i zYeB7t>}n*UcHNy+8iE=edq7Si3p#bJkp|kV8WNN$TU(D!Ex;Nn4eLVo^Jp^v{{Xh} zEyn(rcLB)X7Hs^3;6T_7zJ-l{5o`q62e1?5S^u5jr`Y)Khdy`(90g0@a5xMW!9rL7 zGVgyMHvbi{8Dzh}7lQZ!euEF-i|`g$20Mbx?cW5~LLR!|AlMJ~f#Rj0Zh@U9Ww%i%O$bIw#+o zJoBd984tR%a09ECUQ`?r+8JRT7QFG*J1qDScUVZjq|Ni0cjY2F-l*+;ypQL1 +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global 
T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); + +template +__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* channel_sum); +template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, 
__global const double* data, __global double* channel_sum); + +template +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + +template +__kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } +} + +template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); +template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); + + template __kernel void SoftmaxLossForwardGPU(const int nthreads, __global 
T* prob_data, __global T* label,__global T* loss, diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index b768f05f..fe3e4c25 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -103,7 +103,7 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo #ifdef Track_data_transfer #endif - CHECK_BLOB_DATA(top[0], 20, "top[0]"); +// CHECK_BLOB_DATA(top[0], 20, "top[0]"); // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index aa2debdf..960073f2 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -96,9 +96,8 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, } } - //Forward_cpu(bottom, top); - CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - CHECK_BLOB_DATA(top[0],20, "top[0]"); + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -142,7 +141,7 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); + CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 7799950e..d08805d1 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -13,12 +13,9 @@ namespace caffe { template void DropoutLayer::ocl_setup(int bottom_count){ //create OpenCL related cl_mem objects and kernels - //if(Caffe::mode() == Caffe::GPU){ - cl_int _err; - ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat",&_err); - ocl_Kernel_Bwd = 
clCreateKernel(amdDevice.Program,"DropoutBackwardfloat",&_err); - rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat",&_err); - OCL_CHECK(_err); + ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat", NULL); + ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat", NULL); + rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat", NULL); MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL); } @@ -96,20 +93,28 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); + //unsigned int* mask = + // static_cast(rand_vec_.mutable_gpu_data()); +#ifdef use_cpu_generator_dropout + unsigned int* mask_cpu = + static_cast(rand_vec_.mutable_cpu_data()); + caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu); + OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); + Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); +#else // caffe_gpu_rng_uniform(count, mask); - caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); - Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); - + Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); +#endif // set thresholds // NOLINT_NEXT_LINE(whitespace/operators) // DropoutForward<<>>( // count, bottom_data, mask, uint_thres_, scale_, top_data); // CUDA_POST_KERNEL_CHECK; } else { - caffe_gpu_copy(count, bottom_data, top_data); + //caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data); + if(bottom_data != top_data) + OCL_CHECK( clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)bottom_data, (cl_mem)top_data, 0, 0, 
count*sizeof(Dtype), 0, NULL, NULL) ); } } @@ -122,8 +127,8 @@ void DropoutLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); + //const unsigned int* mask = + // static_cast(rand_vec_.gpu_data()); const int count = bottom[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) // DropoutBackward<<::~PoolingLayer(){ OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) ); OCL_CHECK( clReleaseKernel(StoPoolForwardTest_kernel) ); OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) ); - OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) ); + OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) ); OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) ); } diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 973db6e7..07c2fcfc 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -24,6 +24,27 @@ void SoftmaxLayer::Reshape(const vector*>& bottom, scale_.Reshape(scale_dims); } +template +void SoftmaxLayer::ocl_setup(){ + cl_int err = 0; + channel_max_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_max_float", &err); + channel_subtract_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_subtract_float", &err);; + exp_kernel = clCreateKernel(amdDevice.Program, "kernel_exp_float", &err);; + channel_sum_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_sum_float", &err);; + channel_div_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_div_float", &err);; + channel_dot_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_dot_float", &err);; +} + +template +SoftmaxLayer::~SoftmaxLayer(){ + clReleaseKernel(channel_max_kernel); + clReleaseKernel(channel_subtract_kernel); + clReleaseKernel(exp_kernel); + clReleaseKernel(channel_sum_kernel); + clReleaseKernel(channel_div_kernel); + 
clReleaseKernel(channel_dot_kernel); +} + template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -88,16 +109,60 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - Forward_cpu(bottom, top); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = bottom[0]->count(); + int channels = top[0]->shape(softmax_axis_); + + caffe_gpu_copy(count, bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. + // compute max + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_max(channel_max_kernel, outer_num_, channels, inner_num_, top_data, + scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract(channel_subtract_kernel, count, outer_num_, channels, inner_num_, + scale_data, top_data); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp(exp_kernel, count, top_data, top_data); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum(channel_sum_kernel, outer_num_, channels, inner_num_, top_data, + scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div(channel_div_kernel, count, outer_num_, channels, inner_num_, + scale_data, top_data); } template void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - Backward_cpu(top, propagate_down, bottom); -} + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = top[0]->count(); + int channels = 
top[0]->shape(softmax_axis_); + caffe_gpu_copy(count, top_diff, bottom_diff); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_dot(channel_dot_kernel, outer_num_, channels, inner_num_, + top_diff, top_data, scale_data); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract(channel_subtract_kernel, count, outer_num_, channels, inner_num_, + scale_data, bottom_diff); + // elementwise multiplication + caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); +} #ifdef CPU_ONLY diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 2d4b1da9..715297a6 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -27,13 +27,14 @@ void Solver::ocl_setup(){ powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } -/* -template -Solver::~Solver(){ +//template +/*Solver::~Solver(){ OCL_CHECK( clReleaseKernel(scalar_kernel) ); + OCL_CHECK( clReleaseKernel(add_kernel) ); OCL_CHECK( clReleaseKernel(div_kernel) ); OCL_CHECK( clReleaseKernel(powx_kernel) ); -}*/ +} +*/ template Solver::Solver(const string& param_file) diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 501794dc..31384eb6 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -133,6 +133,125 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p template float softmax_gpu(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss); template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); +template +void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* out) +{ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), 
(void*)&channels) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {num*spatial_dim}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const float* data, float* out); +template void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const double* data, double* out); + +template +void kernel_channel_subtract(cl_kernel Kernel, const int count, + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data){ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) ); + OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); + + size_t Global_Work_Size[1] = {count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_channel_subtract(cl_kernel Kernel, const int count, + const int num, const int channels, + const int spatial_dim, const float* channel_max, float* data); +template void kernel_channel_subtract(cl_kernel Kernel, const int count, + const int num, const int channels, + const int spatial_dim, const double* channel_max, double* data); + +template +void kernel_exp(cl_kernel 
Kernel, const int count, const Dtype* data, Dtype* out) +{ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_exp(cl_kernel Kernel, const int count, const float* data, float* out); +template void kernel_exp(cl_kernel Kernel, const int count, const double* data, double* out); + +template +void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) +{ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); + + size_t Global_Work_Size[1] = {num*channels}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum); +template void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum); + +template +void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_sum, Dtype* data) +{ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( 
clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); + OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); + + size_t Global_Work_Size[1] = {count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, + const int spatial_dim, const float* channel_sum, float* data); +template void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, + const int spatial_dim, const double* channel_sum, double* data); + +template +void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) +{ + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data_1) ); + OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) ); + OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) ); + + size_t Global_Work_Size[1] = {num*spatial_dim}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, + const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot); +template void kernel_channel_dot(cl_kernel 
Kernel, const int num, const int channels, + const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot); + template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 0b7523fc..e350866f 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -291,8 +291,9 @@ int time() { RegisterBrewFunction(time); int main(int argc, char** argv) { + FLAGS_log_dir = "./log/"; // Print output to stderr (while still logging). - FLAGS_alsologtostderr = 1; + FLAGS_alsologtostderr = 0; // Usage message. gflags::SetUsageMessage("command line brew\n" "usage: caffe \n\n" From 75668464c2765a6c927342154bd217d1a6cc2eca Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 1 Aug 2015 00:21:29 +0800 Subject: [PATCH 014/124] cleanup the kernel interface of conv, relu and dropout --- include/caffe/util/im2col.hpp | 14 +++++ include/caffe/util/ocl_wrapper.hpp | 8 +-- include/caffe/vision_layers.hpp | 18 +++--- src/caffe/OCL_kernel.cl | 91 ++++++++++++++++++++++++++- src/caffe/layers/base_conv_layer.cpp | 8 +-- src/caffe/layers/dropout_layer.cpp | 6 +- src/caffe/layers/relu_layer.cpp | 4 +- src/caffe/util/im2col.cpp | 93 ++++++++++++++++++++++++++++ src/caffe/util/math_functions.cpp | 7 ++- src/caffe/util/ocl_wrapper.cpp | 24 +++---- tools/caffe.cpp | 4 +- 11 files changed, 237 insertions(+), 40 deletions(-) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 066eb2fc..306a5d16 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -15,6 +15,20 @@ void col2im_cpu(const Dtype* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + 
Dtype* data_im, const int img_offset); + +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); template void im2col_gpu(const Dtype* data_im, const int channels, diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 5e86b1e2..35ad695e 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -68,19 +68,19 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); template -void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); +void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); template -void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); +void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); template void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data); +void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data); template -void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float 
threshold_, const Dtype scale_, Dtype* bottom_diff); +void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff); template void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index b46130e8..4ccdeb80 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -91,16 +91,16 @@ class BaseConvolutionLayer : public Layer { } #ifndef CPU_ONLY inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { -// im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, -// kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - im2col_gpu(im2col_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0); + im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0); + // im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, + // conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - // col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - // kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - col2im_gpu(col2im_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, pad_h_, stride_h_, data, bottom_offset_); + col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); + // col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + // kernel_h_, pad_h_, stride_h_, 
data, bottom_offset_); } #endif @@ -119,7 +119,7 @@ class BaseConvolutionLayer : public Layer { //opencl related data structures protected: - cl_kernel im2col_kernel, col2im_kernel; + cl_kernel im2col_gpu_kernel, col2im_gpu_kernel; cl_kernel oclmem_kernel; cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat; cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index df6e42ce..07f8eb16 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -783,7 +783,7 @@ __kernel void im2col(const int n, __global T* data_im, const int img_offset, con } } -template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); template @@ -823,6 +823,95 @@ template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const 
int col_offset, const int optnum); +template +__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel(const int n, __global const float* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2col_gpu_double_kernel))) void im2col_gpu_kernel(const int n, __global const double* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); + +template +__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} + +template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + template __kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){ int index = get_global_id(0); diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 38d8952d..f321c9ff 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -32,8 +32,8 @@ void Alloc_public_tmp_mem(size_t 
subtop_size, size_t trans_size) template void BaseConvolutionLayer::ocl_setup() { - im2col_kernel = clCreateKernel(amdDevice.Program,"im2colfloat", NULL); - col2im_kernel = clCreateKernel(amdDevice.Program,"col2imfloat", NULL); + im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL); + col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL); oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL); @@ -53,8 +53,8 @@ void BaseConvolutionLayer::ocl_setup() { template BaseConvolutionLayer::~BaseConvolutionLayer(){ - OCL_CHECK( clReleaseKernel(im2col_kernel) ); - OCL_CHECK( clReleaseKernel(col2im_kernel) ); + OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) ); + OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) ); OCL_CHECK( clReleaseKernel(oclmem_kernel) ); OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) ); OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index d08805d1..996098bc 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -100,11 +100,11 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, static_cast(rand_vec_.mutable_cpu_data()); caffe_rng_bernoulli(count, 1. 
- threshold_, mask_cpu); OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); - Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else // caffe_gpu_rng_uniform(count, mask); caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); - Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #endif // set thresholds // NOLINT_NEXT_LINE(whitespace/operators) @@ -135,7 +135,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, // CAFFE_CUDA_NUM_THREADS>>>( // count, top_diff, mask, uint_thres_, scale_, bottom_diff); // CUDA_POST_KERNEL_CHECK; - Dropout_bp_gpu(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); + DropoutBackward(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); } else { caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); } diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index d7b0a838..8690e938 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -67,7 +67,7 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; - Relu_fp_gpu(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); + ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); } @@ -85,7 +85,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, // ReLUBackward<<>>( // count, top_diff, bottom_data, bottom_diff, negative_slope); // CUDA_POST_KERNEL_CHECK; - 
Relu_bp_gpu(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); + ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); } } diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index ac44f425..0c48257d 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -121,6 +121,99 @@ template void col2im_gpu(const double* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); */ + +//cannot use now, need to modify kernel. +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) +{ + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&kernel_h); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&kernel_w); + + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad_h); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_w); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&stride_h); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_w); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col); + 
ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + clFinish(amdDevice.CommandQueue); + +} + +template void im2col_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col, const int col_offset); +template void im2col_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col, const int col_offset); + +//cannot use now, need to modify kernel +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) +{ + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&patch_h); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&patch_w); 
+ ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_h); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&pad_w); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_h); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&stride_w); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset); + + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + +template void col2im_gpu(cl_kernel Kernel, const float* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, float* data_im, const int img_offset); +template void col2im_gpu(cl_kernel Kernel, const double* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w,const int stride_h, const int stride_w, + double* data_im, const int img_offset); + template void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 85af49d1..9ba72e41 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -240,13 +240,14 @@ void caffe_copy(const int N, const double* X, double* Y) { template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { - CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); - + 
if(X != Y) + CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { - CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + if(X != Y) + CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 31384eb6..7b57d329 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -633,7 +633,7 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const fl template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); template -void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ +void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); @@ -645,11 +645,11 @@ void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dt OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Relu_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope); -template void Relu_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope); +template void ReLUForward(cl_kernel Kernel, const int count, const float* 
bottom_data, float* top_data, float negative_slope); +template void ReLUForward(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope); template -void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ +void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); @@ -662,8 +662,8 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void Relu_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); -template void Relu_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); +template void ReLUBackward(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); +template void ReLUBackward(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); template void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { @@ -878,7 +878,7 @@ template void caffe_gpu_powx (cl_kernel Kernel, const int n, const float* template void caffe_gpu_powx (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y); template -void Dropout_fp_gpu(cl_kernel kernel, const int count, const 
Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) +void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) { cl_int ret; ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); @@ -893,11 +893,11 @@ void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Dropout_fp_gpu(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); -template void Dropout_fp_gpu(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); +template void DropoutForward(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); +template void DropoutForward(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); template -void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) +void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) { cl_int ret; ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); @@ -912,7 +912,7 @@ void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, co size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Dropout_bp_gpu(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); -template void 
Dropout_bp_gpu(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); +template void DropoutBackward(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); +template void DropoutBackward(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); } // namespace caffe diff --git a/tools/caffe.cpp b/tools/caffe.cpp index e350866f..df3b390a 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -291,9 +291,9 @@ int time() { RegisterBrewFunction(time); int main(int argc, char** argv) { - FLAGS_log_dir = "./log/"; + // FLAGS_log_dir = "./log/"; // Print output to stderr (while still logging). - FLAGS_alsologtostderr = 0; + FLAGS_alsologtostderr = 1; // Usage message. gflags::SetUsageMessage("command line brew\n" "usage: caffe \n\n" From 77c7cb9a9625f72594ca972d44d180cdc4dff249 Mon Sep 17 00:00:00 2001 From: Junli Date: Sun, 2 Aug 2015 11:58:06 +0800 Subject: [PATCH 015/124] minor fix --- src/caffe/layers/base_conv_layer.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index f321c9ff..729fafb2 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -330,11 +330,6 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - /* caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); - */ caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights, 
weight_offset_ * g, output, top_offset_+output_offset_ * g, From 102649bc589eee4ab050fb00cdd5eaffdbcdee8e Mon Sep 17 00:00:00 2001 From: junli Date: Sun, 2 Aug 2015 12:15:41 +0800 Subject: [PATCH 016/124] Ported optimized scheme for conv layer --- include/caffe/common.hpp | 6 +- include/caffe/util/im2col.hpp | 4 +- include/caffe/vision_layers.hpp | 18 ++- src/caffe/OCL_kernel.cl | 8 +- src/caffe/layers/base_conv_layer.cpp | 184 ++++++++++++++++++++++++++- src/caffe/layers/conv_layer.cpp | 54 +++++++- src/caffe/net.cpp | 16 +++ src/caffe/util/im2col.cpp | 71 +++++------ tools/caffe.cpp | 4 +- 9 files changed, 308 insertions(+), 57 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 07d26556..070513b5 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -22,6 +22,8 @@ #include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" #include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/im2col.hpp" // gflags 2.1 issue: namespace google was changed to gflags without warning. 
// Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version @@ -79,10 +81,10 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -//#define use_packing_scheme 1 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -//#define global_packing_N 16 +#define global_packing_N 16 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 306a5d16..862a539b 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -55,7 +55,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, template void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, const int optnum); + const int stride, Dtype* data_col, const int col_offset, int optnum); template void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, @@ -65,7 +65,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c template void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, const int optnum); + const int stride, Dtype* data_im, const int img_offset, int optnum); template void col2im_gpu_ocl(cl_mem data_col, const int channels, diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 4ccdeb80..2d8f6390 100644 --- 
a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -113,13 +113,18 @@ class BaseConvolutionLayer : public Layer { int weight_offset_; int col_offset_; int output_offset_; + int M_, N_, K_; Blob col_buffer_; Blob bias_multiplier_; //opencl related data structures protected: - cl_kernel im2col_gpu_kernel, col2im_gpu_kernel; + void forward_gpu_opt(const vector*>& bottom, const Dtype* weight, + const vector*>& top, bool skip_im2col = false) ; + void backward_gpu_opt(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + cl_kernel im2col_kernel, col2im_kernel; cl_kernel oclmem_kernel; cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat; cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; @@ -184,7 +189,7 @@ class ConvolutionLayer : public BaseConvolutionLayer { virtual inline const char* type() const { return "Convolution"; } - protected: +protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, @@ -195,6 +200,15 @@ class ConvolutionLayer : public BaseConvolutionLayer { const vector& propagate_down, const vector*>& bottom); virtual inline bool reverse_dimensions() { return false; } virtual void compute_output_shape(); + + virtual void Forward_gpu_org(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Forward_gpu_opt(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_opt(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 07f8eb16..48076725 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -1310,17 +1310,17 @@ void StoPoolBackward(const int nthreads, } } template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel void StoPoolBackward(const int nthreads, - __global 
float* rand_idx, __global float* const top_diff, + __global float* rand_idx, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* const bottom_diff); + const int stride_w, __global float* bottom_diff); template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward(const int nthreads, - __global double* rand_idx, __global double* const top_diff, + __global double* rand_idx, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* const bottom_diff); + const int stride_w, __global double* bottom_diff); template __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 729fafb2..58eb0e1f 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -5,6 +5,7 @@ #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/common.hpp" namespace caffe { @@ -38,15 +39,18 @@ void BaseConvolutionLayer::ocl_setup() { im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL); opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL); - ocl_Kernel_im2colfloat = clCreateKernel(amdDevice.Program,"im2colfloat_yuan",NULL); - ocl_Kernel_col2imfloat = clCreateKernel(amdDevice.Program,"col2imfloat_yuan",NULL); ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL); ocl_Kernel_transform = 
clCreateKernel(amdDevice.Program,"transformfloat",NULL); + M_ = conv_out_channels_ / group_; + K_ = kernel_dim_ / group_; + N_ = conv_out_spatial_dim_; + #ifdef use_packing_scheme size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); Alloc_public_tmp_mem(subtop_size, trans_size); + //printf("K_ =%d, N_=%d M_=%d, group_=%d, trans_size = %d, subtop_size=%d \n", K_, N_, M_, group_, trans_size, subtop_size); #endif } @@ -58,8 +62,6 @@ template OCL_CHECK( clReleaseKernel(oclmem_kernel) ); OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) ); OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_im2colfloat) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_col2imfloat) ); OCL_CHECK( clReleaseKernel(im2col_opt_kernel) ); OCL_CHECK( clReleaseKernel(col2im_opt_kernel) ); } @@ -126,8 +128,6 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, conv_in_channels_ = channels_; } - //initializa OpenCL kernels and cl_mem objects - ocl_setup(); // Handle the parameters: weights and biases. // - blobs_[0] holds the filter weights @@ -214,6 +214,8 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); } + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); } template @@ -371,6 +373,176 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, bias, (size_t)0, 1); } + +template +void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bottom, const Dtype* weight, const vector*>& top, bool skip_im2col){ + + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + Dtype* top_data = top[i]->mutable_gpu_data(); + + Dtype* col_data = col_buffer_.mutable_gpu_data(); + /*in the packing schme, M, K stay the same. 
N multiplies by opt_num becomes much bigger N'. + N' is the M in sgemm call.*/ + int M_org = M_ * group_; + int col_offset = K_ * N_; + int top_offset = M_ * N_; + int weight_offset = M_ * K_; + int opt_num2 = global_packing_N; + cl_command_queue Queue; + cl_event prof_event; + //LOG(INFO) << "conv_fp optimized scheme"; + for (int n = 0; n < num_; n += opt_num2) { + opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; + /*col_offset is the offset for sgemm, including packing and groups + for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ + top_offset = M_ * N_ * opt_num2; + col_offset = K_ * N_ * opt_num2; + //step1: packed im2col, col_size = (K_ * group_ ) * N_ + //this should be opt_num2 images packing together. + im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + + //step 2: sgemm: Top (subTopMem) = weight * col_data +#ifdef multiQ + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; + prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, + (Dtype)0., (Dtype*)subTopMem, top_offset * g); + } + //sync two command queues + if(group_ == 2){ + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#else + Queue = amdDevice.CommandQueue; + //printf("M_=%d, N_=%d, K_=%d, opt_num2=%d, col_offset=%d, top_offset=%d, weight_offset=%d \n", M_, N_, K_, opt_num2, col_offset, top_offset, weight_offset); + for (int g = 0; g < group_; ++g) { + prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, + (Dtype)0., (Dtype*)subTopMem, top_offset * g); + } +#endif + //step 3: tranform + 
transform_gpu(ocl_Kernel_transform, (Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); + //step 4: add bias + /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/ + + for (int z = 0; z < opt_num2; z++) + if (bias_term_) { + caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z); + } + } +} +} + +template +void BaseConvolutionLayer::backward_gpu_opt(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count()); + for (int n = 0; n < num_; ++n) { + caffe_gpu_gemvv(CblasNoTrans, M_, N_, + (Dtype)1., top_diff, top[i]->offset(n), N_, + reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, + bias_diff, (size_t)0, 1); + } + } + + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + Dtype* col_data = col_buffer_.mutable_gpu_data(); + Dtype* col_diff = col_buffer_.mutable_gpu_diff(); + int col_offset = K_ * N_; + int top_offset = M_ * N_; + int weight_offset = M_ * K_; + int opt_num2 = global_packing_N; + int g = 0; + cl_command_queue Queue; + cl_event prof_event; + //LOG(INFO) << "conv_bp optimized scheme"; + + for (int n = 0; n < num_; n += opt_num2) { + opt_num2 = opt_num2 > (num_ - n)? 
(num_ - n) : opt_num2; + /*col_offset is the offset for sgemm, including packing and groups + for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ + top_offset = M_ * (N_ * opt_num2); + col_offset = K_ * (N_ * opt_num2); + //step1: packed im2col, col_size = (K_ * group_ ) * N_ + //this should be opt_num2 images packing together. + im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + + //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize + int height_top = M_ * group_, width_top = N_; + //if (opt_num2 >1) + opttrans(opttrans_kernel, top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); + + //step 3: sgemm: Top (subTopMem) = weight * col_data + for(g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + (Dtype)1., (Dtype*)subTopMem, top_offset * g, + (Dtype*)transMem, col_offset * g, (Dtype)1., + (Dtype*)weight_diff, weight_offset * g); + } + + //step4: + if (propagate_down[i]) { + for (g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + prof_event = caffe_gpu_gemmex(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_, + (Dtype)1., weight, weight_offset * g, + (Dtype*)subTopMem, top_offset * g, + (Dtype)0., (Dtype*)transMem, col_offset * g); + } + } + +#ifdef multiQ + if(group_ ==2){ + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + + //step5: col2im + col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, 
channels_, height_, width_, kernel_w_, pad_w_, + stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); +#ifdef Track_layer + LOG(WARNING) << "conv bp done"; +#endif + + } + } + } +} + #endif // !CPU_ONLY INSTANTIATE_CLASS(BaseConvolutionLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 960073f2..8a6a3743 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -75,6 +75,52 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + if (use_packing_scheme && global_packing_N >1) + Forward_gpu_opt(bottom, top); + else + Forward_gpu_org(bottom, top); +} + +template +void ConvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (use_packing_scheme && global_packing_N >1) + Backward_gpu_opt(top, propagate_down, bottom); + else + Backward_gpu_org(top, propagate_down, bottom); +} + +template +void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom, + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + this->forward_gpu_opt(bottom, weight, top); + +/* +#ifdef check_gradient + const Dtype *cpu_bottom_data = bottom[0]->cpu_data(); Dtype *cpu_top_data = (Dtype*)(*top)[0]->cpu_data(); + + printf("\n\nbottom data GPU:\n"); + for(int i=0; i +void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom, const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { @@ -99,9 +145,13 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); CHECK_BLOB_DATA(top[0],20, "top[0]"); } - template -void ConvolutionLayer::Backward_gpu(const vector*>& top, +void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + //this->backward_gpu_opt(top, propagate_down, 
bottom); +} +template +void ConvolutionLayer::Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 4de7a146..9869b33f 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -13,6 +13,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" +#include "caffe/util/benchmark.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -503,6 +504,10 @@ Dtype Net::ForwardFromTo(int start, int end) { InputDebugInfo(i); } } + + CPUTimer forward_timer; + forward_timer.Start(); + for (int i = start; i <= end; ++i) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; //Yibing add for porting @@ -513,6 +518,10 @@ Dtype Net::ForwardFromTo(int start, int end) { //Yibing add for porting clFinish(amdDevice.CommandQueue); } + + forward_timer.Stop(); + printf("Forward time: %f\n\n", forward_timer.MilliSeconds()); + return loss; } @@ -571,6 +580,10 @@ template void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); + + CPUTimer backward_timer; + backward_timer.Start(); + for (int i = start; i >= end; --i) { if (layer_need_backward_[i]) { //Yibing add for porting @@ -582,6 +595,9 @@ void Net::BackwardFromTo(int start, int end) { clFinish(amdDevice.CommandQueue); } } + + backward_timer.Stop(); + printf("Backward time: %f\n\n", backward_timer.MilliSeconds()); } template diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 0c48257d..b9257675 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -82,45 +82,42 @@ template void col2im_cpu(const double* data_col, const int channels, const int stride_w, double* data_im); -/* template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int 
kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { - -} +void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, int optnum){ + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); + ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); + ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); + ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset); + ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); + OCL_CHECK(ret); -// Explicit instantiation -template void im2col_gpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col); -template void im2col_gpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col); -*/ -/* -template -void 
col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { + size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation -template void col2im_gpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); -template void col2im_gpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); -*/ +template void col2im_gpu_opt(cl_kernel kernel, const float* data_col, const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_im, const int img_offset, int optnum); +template void col2im_gpu_opt(cl_kernel kernel, const double* data_col, const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_im, const int img_offset, int optnum); //cannot use now, need to modify kernel. 
template @@ -290,7 +287,7 @@ template void im2col_16_gpu(cl_kernel Kernel, const double* data_im, con template void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, const int optnum) { + const int stride, Dtype* data_col, const int col_offset, int optnum) { int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; @@ -320,10 +317,10 @@ void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset template void im2col_opt_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset, const int optnum); + const int stride, float* data_col, const int col_offset, int optnum); template void im2col_opt_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset, const int optnum); + const int stride, double* data_col, const int col_offset, int optnum); template void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, diff --git a/tools/caffe.cpp b/tools/caffe.cpp index df3b390a..e350866f 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -291,9 +291,9 @@ int time() { RegisterBrewFunction(time); int main(int argc, char** argv) { - // FLAGS_log_dir = "./log/"; + FLAGS_log_dir = "./log/"; // Print output to stderr (while still logging). - FLAGS_alsologtostderr = 1; + FLAGS_alsologtostderr = 0; // Usage message. 
gflags::SetUsageMessage("command line brew\n" "usage: caffe \n\n" From b09b8a4af959f3f1d8a0619bb2d8d3392fe72467 Mon Sep 17 00:00:00 2001 From: junli Date: Sun, 2 Aug 2015 12:32:05 +0800 Subject: [PATCH 017/124] added Makefile.config. As needed by fresh git clone; then make all --- Makefile.config | 93 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 Makefile.config diff --git a/Makefile.config b/Makefile.config new file mode 100644 index 00000000..2d8124d6 --- /dev/null +++ b/Makefile.config @@ -0,0 +1,93 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# cuDNN acceleration switch (uncomment to build with cuDNN). +# USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). +# CPU_ONLY := 1 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ + +# CUDA directory contains bin/ and lib/ directories that we need. +CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_50,code=compute_50 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +BLAS := atlas +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! 
+# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. +# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. +INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# Uncomment to use `pkg-config` to specify OpenCV library paths. 
+# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 + DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ From 270a7d9b43529fce9bac2df00a535282a8167dc4 Mon Sep 17 00:00:00 2001 From: junli Date: Sun, 2 Aug 2015 13:31:48 +0800 Subject: [PATCH 018/124] fix some merge bugs; add ./models into git --- include/caffe/vision_layers.hpp | 5 ++--- src/caffe/layers/base_conv_layer.cpp | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 2d8f6390..a9c644c2 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -124,11 +124,10 @@ class BaseConvolutionLayer : public Layer { const vector*>& top, bool skip_im2col = false) ; void backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - cl_kernel im2col_kernel, col2im_kernel; + cl_kernel im2col_gpu_kernel, col2im_gpu_kernel; + cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel; cl_kernel oclmem_kernel; - cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat; cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; - cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel; public: static cl_mem subTopMem, transMem; static size_t subtop_mem_size, trans_mem_size; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 58eb0e1f..5384c5e8 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -50,7 +50,6 @@ void BaseConvolutionLayer::ocl_setup() { size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); size_t trans_size = 
(size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); Alloc_public_tmp_mem(subtop_size, trans_size); - //printf("K_ =%d, N_=%d M_=%d, group_=%d, trans_size = %d, subtop_size=%d \n", K_, N_, M_, group_, trans_size, subtop_size); #endif } @@ -413,14 +412,12 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, (Dtype)0., (Dtype*)subTopMem, top_offset * g); } - //sync two command queues if(group_ == 2){ clFinish(amdDevice.CommandQueue); clFinish(amdDevice.CommandQueue_helper); } #else Queue = amdDevice.CommandQueue; - //printf("M_=%d, N_=%d, K_=%d, opt_num2=%d, col_offset=%d, top_offset=%d, weight_offset=%d \n", M_, N_, K_, opt_num2, col_offset, top_offset, weight_offset); for (int g = 0; g < group_; ++g) { prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, From 3e3fb86391ca3dd104b23b5b3b1caf1388633f85 Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 4 Aug 2015 01:03:12 +0800 Subject: [PATCH 019/124] cleaned up sgemm_ex interfaces; re-organized the relu layer kernel and wrappers --- include/caffe/device.hpp | 5 +++- include/caffe/neuron_layers.hpp | 13 ++++++++-- include/caffe/util/math_functions.hpp | 4 ++-- src/caffe/device.cpp | 30 +++++++++++++++++++++++- src/caffe/layers/base_conv_layer.cpp | 18 +++++++------- src/caffe/layers/conv_layer.cpp | 2 +- src/caffe/layers/inner_product_layer.cpp | 8 +++---- src/caffe/layers/relu_layer.cl | 22 +++++++++++++++++ src/caffe/layers/relu_layer.cpp | 9 +++---- src/caffe/util/math_functions.cpp | 10 ++++---- 10 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 src/caffe/layers/relu_layer.cl diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 07e65848..7360dacd 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -20,9 +20,10 @@ class Device{ cl_command_queue 
CommandQueue; cl_command_queue CommandQueue_helper; cl_program Program; + cl_device_id * pDevices; clblasOrder col; clblasOrder row; - + cl_int Init(); cl_int ConvertToString(const char *pFileName,std::string &Str); @@ -30,6 +31,8 @@ class Device{ void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); void GetDeviceInfo(); + + cl_program BuildProgram(const char*); template void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str); diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 67d5e0b2..bcb834de 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -9,6 +9,7 @@ #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/ocl_wrapper.hpp" #define HDF5_DATA_DATASET_NAME "data" #define HDF5_DATA_LABEL_NAME "label" @@ -487,9 +488,17 @@ class ReLULayer : public NeuronLayer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); -//OpenCL related setiup +//OpenCL related setup void ocl_setup(); - +//OpenCL wrapper + void ReLUForward_gpu(int count, const Dtype *bottom_data,Dtype *top_data, Dtype negative_slope) + { + ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); + } + void ReLUBackward_gpu(int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff, Dtype negative_slope) + { + ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); + } protected: cl_kernel ReLUForward_kernel; cl_kernel ReLUBackward_kernel; diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index c2720cf5..c9a391ac 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -31,7 +31,7 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, Dtype* C); template -cl_event caffe_gpu_gemmex( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, 
+cl_event caffe_gpu_gemm( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, Dtype* C, const int offC); @@ -44,7 +44,7 @@ void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, template -cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, Dtype* C, const int offC); diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index bce26316..0e98ada0 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -51,7 +51,6 @@ cl_int Device::Init(){ //printf("%s %s\n", platformName, openclVersion); GetDeviceInfo(); - cl_device_id * pDevices; cl_uint uiNumDevices; cl_bool unified_memory = false; switch(Caffe::mode()) { @@ -233,6 +232,35 @@ cl_int Device::ConvertToString(const char *pFileName,std::string &Str){ return -1; } +cl_program Device::BuildProgram(const char *pFileName) +{ + //Read our own kernel file + const char *pSource; + std::string strSource = ""; + ConvertToString(pFileName, strSource); + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = {0}; + uiArrSourceSize[0] = strlen(pSource); + cl_program program = NULL; + program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL); + if(NULL == program){ + fprintf(stderr,"Err: Failed to create program\n"); + } + + //Build Program + cl_int iStatus = clBuildProgram(program, 1, pDevices, buildOption, NULL, NULL); + LOG(INFO) << "Build Program"; + if(CL_SUCCESS != iStatus){ + fprintf(stderr,"Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + 
clReleaseProgram(program); + return NULL; + } + return program; +} + void Device::DisplayPlatformInfo(){ cl_int err; size_t size; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 5384c5e8..4bb1d1e6 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -304,7 +304,7 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, (Dtype)0., output + output_offset_ * g); */ //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count()); - caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, + caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g, (Dtype)0., output, top_offset_+output_offset_ * g); @@ -317,7 +317,7 @@ void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, /*caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), (Dtype)1., output);*/ - caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, num_output_, + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, height_out_*width_out_, 1, (Dtype)1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype)1., output, top_offset_); @@ -331,7 +331,7 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, + caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights, weight_offset_ * g, output, top_offset_+output_offset_ * g, (Dtype)0., col_buff, col_offset_ * g); @@ -354,7 +354,7 @@ void 
BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, (Dtype)1., weights + weight_offset_ * g);*/ - caffe_gpu_gemmex(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, + caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output, top_offset_, (Dtype*)col_buff, col_offset_ * g, (Dtype)1., (Dtype*)weights, weight_offset_ * g); @@ -408,7 +408,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; else Queue = amdDevice.CommandQueue_helper; - prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, (Dtype)0., (Dtype*)subTopMem, top_offset * g); } @@ -419,7 +419,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo #else Queue = amdDevice.CommandQueue; for (int g = 0; g < group_; ++g) { - prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, (Dtype)0., (Dtype*)subTopMem, top_offset * g); } @@ -431,7 +431,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int z = 0; z < opt_num2; z++) if (bias_term_) { - caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, num_output_, + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype)1., top_data, top[i]->offset(n) + num_output_ * 
N_ * z); @@ -499,7 +499,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t #else Queue = amdDevice.CommandQueue; #endif - prof_event = caffe_gpu_gemmex(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, (Dtype)1., (Dtype*)subTopMem, top_offset * g, (Dtype*)transMem, col_offset * g, (Dtype)1., (Dtype*)weight_diff, weight_offset * g); @@ -514,7 +514,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t #else Queue = amdDevice.CommandQueue; #endif - prof_event = caffe_gpu_gemmex(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_, + prof_event = caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_, (Dtype)1., weight, weight_offset * g, (Dtype*)subTopMem, top_offset * g, (Dtype)0., (Dtype*)transMem, col_offset * g); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 8a6a3743..77697023 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -148,7 +148,7 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom template void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - //this->backward_gpu_opt(top, propagate_down, bottom); + this->backward_gpu_opt(top, propagate_down, bottom); } template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 03dbbeb5..4242afa3 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -125,10 +125,10 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., + caffe_gpu_gemm(CblasNoTrans, 
CblasTrans, M_, N_, K_,(Dtype)1., bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); if (bias_term_) { - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., + caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., bias_multiplier_.gpu_data(),0, this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); } @@ -142,7 +142,7 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight - caffe_gpu_gemm_ex(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); } if (bias_term_ && this->param_propagate_down_[1]) { @@ -156,7 +156,7 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, if (propagate_down[0]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bottom data - caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., bottom[0]->mutable_gpu_diff(), 0); } diff --git a/src/caffe/layers/relu_layer.cl b/src/caffe/layers/relu_layer.cl new file mode 100644 index 00000000..cebe24cd --- /dev/null +++ b/src/caffe/layers/relu_layer.cl @@ -0,0 +1,22 @@ +template +__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ + int index = get_global_id(0); + if(index < count) + out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; +} + +//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); + +template +__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ + int index = get_global_id(0); + if(index < count) + out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; +} + +template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); +template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); + + diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 8690e938..6ee3237a 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -7,9 +7,10 @@ namespace caffe { template void ReLULayer::ocl_setup(){ + cl_program program = amdDevice.BuildProgram("src/caffe/layers/relu_layer.cl"); cl_int _err=0; - ReLUForward_kernel = clCreateKernel(amdDevice.Program,"ReLUForwardfloat",&_err); - ReLUBackward_kernel = clCreateKernel(amdDevice.Program,"ReLUBackwardfloat",&_err); + ReLUForward_kernel = clCreateKernel(program,"ReLUForwardfloat",&_err); + ReLUBackward_kernel = clCreateKernel(program,"ReLUBackwardfloat",&_err); } template @@ -67,7 +68,7 @@ void ReLULayer::Forward_gpu(const vector*>& 
bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; - ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); + ReLUForward_gpu(count,bottom_data,top_data,negative_slope); } @@ -85,7 +86,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, // ReLUBackward<<>>( // count, top_diff, bottom_data, bottom_diff, negative_slope); // CUDA_POST_KERNEL_CHECK; - ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); + ReLUBackward_gpu(count,top_diff,bottom_data,bottom_diff,negative_slope); } } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 9ba72e41..8bc16ea3 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -64,7 +64,7 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, } template <> -cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { cl_event event; @@ -78,7 +78,7 @@ cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, } template <> -cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { cl_event event; @@ -93,7 +93,7 @@ cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA, template <> -cl_event caffe_gpu_gemmex(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, +cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, 
const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { cl_event event; @@ -108,7 +108,7 @@ cl_event caffe_gpu_gemmex(cl_command_queue *queue, const CBLAS_TRANSPOSE } template <> -cl_event caffe_gpu_gemmex(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, +cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { cl_event event; @@ -620,12 +620,14 @@ template<> void caffe_gpu_sign(const int N, const float *X, float *Y){ cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); + clReleaseKernel(caffe_gpu_sign_kernel); } template<> void caffe_gpu_sign(const int N, const double *X, double *Y){ cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); + clReleaseKernel(caffe_gpu_sign_kernel); } template <> From 18257693490162cb3cc894d6fffb97fe457e7ad9 Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 4 Aug 2015 04:43:24 +0800 Subject: [PATCH 020/124] cleaning up the conv opt interfaces --- src/caffe/layers/base_data_layer.cpp | 10 +++++++++- src/caffe/net.cpp | 14 +++++++++----- src/caffe/util/ocl_util.cpp | 12 +----------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index fe3e4c25..60dfde75 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -3,6 +3,7 @@ #include "caffe/data_layers.hpp" #include "caffe/util/io.hpp" +#include "caffe/util/benchmark.hpp" namespace caffe { @@ -86,8 +87,12 @@ void BasePrefetchingDataLayer::Forward_cpu( template void BasePrefetchingDataLayer::Forward_gpu(const 
vector*>& bottom, const vector*>& top) { + printf("HHHHHH Data forward time: n\n"); // First, join the thread JoinPrefetchThread(); + CPUTimer forward_timer; + forward_timer.Start(); + // Copy the data from prefetch thread to data_layer //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); top[0]->ReshapeLike(prefetch_data_); @@ -99,7 +104,10 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); } clFinish(amdDevice.CommandQueue); - + forward_timer.Stop(); + printf("Data forward time: %f\n\n", forward_timer.MilliSeconds()); + + #ifdef Track_data_transfer #endif diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 9869b33f..4d20cdd7 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -506,17 +506,21 @@ Dtype Net::ForwardFromTo(int start, int end) { } CPUTimer forward_timer; + CPUTimer layer_timer; forward_timer.Start(); for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; -//Yibing add for porting - printf("Forwarding %s\n",layer_names_[i].c_str()); - Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); + //double begin_time = GettickCount(); + layer_timer.Start(); + //printf("Forwarding %s\n",layer_names_[i].c_str()); + Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } -//Yibing add for porting clFinish(amdDevice.CommandQueue); + //double end_time = GettickCount(); + layer_timer.Stop(); + //printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), end_time-begin_time); + printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); } 
forward_timer.Stop(); diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 8feead82..eef9f544 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -13,11 +13,6 @@ namespace caffe { template void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){ cl_int err=0; - //cl_kernel Kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", &err); - //if(NULL==Kernel){ - // fprintf(stderr, "Failed to create kernel %d\n", err); - //} - err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value); err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); @@ -35,12 +30,7 @@ template void ocl_memset(cl_kernel Kernel, double* buffer, const double void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){ - cl_int err=0; - // cl_kernel Kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); - // if(NULL==Kernel){ - // fprintf(stderr, "Failed to create kernel %d\n", err); - // } - + cl_int err=0; err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value); err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); From 0bf247998ff3c19f12aba682519db61299d031ad Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 4 Aug 2015 04:54:27 +0800 Subject: [PATCH 021/124] conv opt cleaning up cont. 
--- include/caffe/vision_layers.hpp | 12 ++++++---- src/caffe/layers/base_conv_layer.cpp | 31 +++++++++++++++++------- src/caffe/layers/conv_layer.cpp | 36 ++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index a9c644c2..e763d31a 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -93,15 +93,18 @@ class BaseConvolutionLayer : public Layer { inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0); - // im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, - // conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); - // col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, - // kernel_h_, pad_h_, stride_h_, data, bottom_offset_); } + inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) { + im2col_gpu(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); + } + inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) { + col2im_gpu(col2im_opt_kernel, (Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); #endif int conv_out_channels_; @@ -114,6 +117,7 @@ class BaseConvolutionLayer : public Layer { int col_offset_; int output_offset_; int M_, N_, K_; + int opt_num2; Blob col_buffer_; Blob bias_multiplier_; diff --git 
a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 4bb1d1e6..54796ae8 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -298,12 +298,6 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { - /*caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); - */ - //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count()); caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g, @@ -311,12 +305,31 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } } +template +void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, + const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + } + col_buff = col_buffer_.gpu_data(); + } + + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, + conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, + (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g, + (Dtype)0., output, top_offset_+output_offset_ * g); + } +} + + template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const Dtype* bias) { - /*caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, 
bias_multiplier_.gpu_data(), - (Dtype)1., output);*/ caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, height_out_*width_out_, 1, (Dtype)1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 77697023..9e863322 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -119,6 +119,42 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom } +template +void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, + const vector*>& top) { + + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + //int col_offset = K_ * N_; + //int top_offset = M_ * N_; + //int weight_offset = M_ * K_; + int opt_num2 = global_packing_N; + + for (int n = 0; n < this->num_; ++n) { + opt_num2 = opt_num2 > (num_ - n)? 
(num_ - n) : opt_num2; + //two intermediate variables to pass offset + this->top_offset_ = M_ * N_ * opt_num2; + this->col_offset_ = K_ * N_ * opt_num2; + this->bottom_offset_ = bottom[i]->offset(n); + this->forward_gpu_gemm_opt(bottom_data, weight, + top_data); + + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } + + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + CHECK_BLOB_DATA(top[0],20, "top[0]"); + +} + template void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom, const vector*>& top) { From f3cd44851df30aecad4d081b32ba252f7d222ffd Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Mon, 3 Aug 2015 16:05:17 -0700 Subject: [PATCH 022/124] conv opt forward done --- include/caffe/util/im2col.hpp | 2 +- include/caffe/vision_layers.hpp | 27 +++++++++----- src/caffe/layers/base_conv_layer.cpp | 55 +++++++++++++++++++++------- src/caffe/layers/conv_layer.cpp | 14 ++++--- src/caffe/util/im2col.cpp | 7 ++-- 5 files changed, 70 insertions(+), 35 deletions(-) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 862a539b..5eb28f9a 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -53,7 +53,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int stride, Dtype* data_col, const int col_offset); template -void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, +void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col, const int col_offset, int optnum); diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index e763d31a..6ba4bfc5 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -59,6 +59,9 @@ class BaseConvolutionLayer : public Layer { void 
weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* weights); void backward_gpu_bias(Dtype* bias, const Dtype* input); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); #endif // reverse_dimensions should return true iff we are implementing deconv, so @@ -99,12 +102,16 @@ class BaseConvolutionLayer : public Layer { kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); } inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) { - im2col_gpu(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, + im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); } inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col2im_opt_kernel, (Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); +} + inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { + transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); +} #endif int conv_out_channels_; @@ -113,11 +120,6 @@ class BaseConvolutionLayer : public Layer { int conv_in_height_; int conv_in_width_; int kernel_dim_; - int weight_offset_; - int col_offset_; - int output_offset_; - int M_, N_, K_; - int opt_num2; Blob col_buffer_; Blob bias_multiplier_; @@ -132,12 +134,15 @@ class BaseConvolutionLayer : public Layer { cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel; cl_kernel oclmem_kernel; cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; + int opt_num2; + int M_, N_, K_; + int weight_offset_; + 
int col_offset_; + int output_offset_; + int top_offset_, top_offset_n, bottom_offset_; public: static cl_mem subTopMem, transMem; static size_t subtop_mem_size, trans_mem_size; - -public: - size_t top_offset_, bottom_offset_; }; /** @@ -210,6 +215,8 @@ class ConvolutionLayer : public BaseConvolutionLayer { const vector& propagate_down, const vector*>& bottom); virtual void Forward_gpu_opt(const vector*>& bottom, const vector*>& top); + virtual void Forward_gpu_opt2(const vector*>& bottom, + const vector*>& top); virtual void Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom); }; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 54796ae8..c6f24064 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -307,23 +307,40 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, template void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* weight, Dtype* output, bool skip_im2col) { const Dtype* col_buff = input; + cl_command_queue Queue; + cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data()); } - col_buff = col_buffer_.gpu_data(); + //col_buff = col_buffer_.gpu_data(); } - - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, - conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g, - (Dtype)0., output, top_offset_+output_offset_ * g); - } + +#ifdef multiQ + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = 
amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; + prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); + } + if(group_ == 2){ + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#else + Queue = amdDevice.CommandQueue; + for (int g = 0; g < group_; ++g) { + prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); + } +#endif + + conv_transform_gpu((Dtype*)subTopMem, output); + } @@ -336,6 +353,16 @@ void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, (Dtype)1., output, top_offset_); } +template +void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, + const Dtype* bias) { + for (int z = 0; z < opt_num2; z++) + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype)1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype)1., output, top_offset_n + num_output_ * N_ * z); +} + template void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, const Dtype* weights, Dtype* input) { @@ -413,7 +440,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo col_offset = K_ * N_ * opt_num2; //step1: packed im2col, col_size = (K_ * group_ ) * N_ //this should be opt_num2 images packing together. 
- im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); //step 2: sgemm: Top (subTopMem) = weight * col_data @@ -496,7 +523,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t col_offset = K_ * (N_ * opt_num2); //step1: packed im2col, col_size = (K_ * group_ ) * N_ //this should be opt_num2 images packing together. - im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 9e863322..7a763dfb 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -132,20 +132,22 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto //int col_offset = K_ * N_; //int top_offset = M_ * N_; //int weight_offset = M_ * K_; - int opt_num2 = global_packing_N; + this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; ++n) { - opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = this->opt_num2 > (this->num_ - n)? 
(this->num_ - n) : this->opt_num2; //two intermediate variables to pass offset - this->top_offset_ = M_ * N_ * opt_num2; - this->col_offset_ = K_ * N_ * opt_num2; + this->top_offset_ = this->M_ * this->N_ * this->opt_num2; + this->top_offset_n = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_ * this->opt_num2; this->bottom_offset_ = bottom[i]->offset(n); + this->weight_offset_ = this->M_ * this->K_; this->forward_gpu_gemm_opt(bottom_data, weight, top_data); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data, bias); + this->forward_gpu_bias_opt(top_data, bias); } } } diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index b9257675..4d28ab1e 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -285,7 +285,7 @@ template void im2col_16_gpu(cl_kernel Kernel, const double* data_im, con const int stride, double* data_col, const int col_offset); template -void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, +void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col, const int col_offset, int optnum) { @@ -315,10 +315,10 @@ void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void im2col_opt_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, +template void im2col_gpu_opt(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, float* data_col, const int col_offset, int optnum); -template void im2col_opt_gpu(cl_kernel Kernel, const double* data_im, const int 
img_offset, const int channels, +template void im2col_gpu_opt(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, double* data_col, const int col_offset, int optnum); @@ -384,7 +384,6 @@ void im2col_gpu_ocl(cl_mem data_im, const int channels, ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col); OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) ); - //std::cout<<"num_kernels"< Date: Mon, 3 Aug 2015 17:52:24 -0700 Subject: [PATCH 023/124] conf opt backward interfaces --- include/caffe/vision_layers.hpp | 18 +++++-- src/caffe/layers/base_conv_layer.cpp | 66 +++++++++++++++++++++++-- src/caffe/layers/conv_layer.cpp | 73 ++++++++++++++++++++-------- 3 files changed, 130 insertions(+), 27 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 6ba4bfc5..233bf48f 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -53,15 +53,19 @@ class BaseConvolutionLayer : public Layer { #ifndef CPU_ONLY void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, Dtype* output, bool skip_im2col = false); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); void forward_gpu_bias(Dtype* output, const Dtype* bias); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); void backward_gpu_gemm(const Dtype* input, const Dtype* weights, Dtype* col_output); + void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, + Dtype* col_output); void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* weights); + void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype* + weights); void backward_gpu_bias(Dtype* bias, const Dtype* input); - void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void 
forward_gpu_bias_opt(Dtype* output, const Dtype* bias); #endif // reverse_dimensions should return true iff we are implementing deconv, so @@ -111,6 +115,12 @@ class BaseConvolutionLayer : public Layer { } inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); +} + inline void conv_transpose_gpu(const Dtype* data){ + opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); +} + inline void ocl_memset(Dtype* data, Dtype value, int count) { + ocl_memset(oclmem_kernel, data, value, count); } #endif @@ -219,6 +229,8 @@ class ConvolutionLayer : public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu_opt2(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index c6f24064..ebd65713 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -381,6 +381,38 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, } } +template +void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, + const Dtype* weights, Dtype* input) { + //Dtype* col_buff = col_buffer_.mutable_gpu_data(); + cl_command_queue Queue; + if (is_1x1_) { + (Dtype*)transMem = input; + } + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_, + (Dtype)1., weights, weight_offset_ * g, + (Dtype*)subTopMem, top_offset_ * g, + (Dtype)0., (Dtype*)transMem, col_offset_ * 
g); + } +#ifdef multiQ + if(group_ ==2){ + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + + if (!is_1x1_) { + conv_col2im_gpu_opt((Dtype*)transMem, input); + } +} + template void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, const Dtype* output, Dtype* weights) { @@ -390,16 +422,42 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, col_buff = col_buffer_.gpu_data(); } for (int g = 0; g < group_; ++g) { - /* caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g);*/ caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output, top_offset_, (Dtype*)col_buff, col_offset_ * g, (Dtype)1., (Dtype*)weights, weight_offset_ * g); } } +template +void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + cl_command_queue Queue; + if (!is_1x1_) { + conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data()); + //col_buff = col_buffer_.gpu_data(); + } + conv_transpose_gpu(output); + + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, + (Dtype)1., (Dtype*)subTopMem, top_offset_ * g, + (Dtype*)transMem, col_offset_ * g, (Dtype)1., + (Dtype*)weights, weight_offset_ * g); +#ifdef multiQ + if(group_ == 2){ + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + } +} template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, diff --git 
a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 7a763dfb..34490f68 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,7 +77,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt(bottom, top); + Forward_gpu_opt2(bottom, top); else Forward_gpu_org(bottom, top); } @@ -97,22 +97,6 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom const Dtype* weight = this->blobs_[0]->gpu_data(); this->forward_gpu_opt(bottom, weight, top); -/* -#ifdef check_gradient - const Dtype *cpu_bottom_data = bottom[0]->cpu_data(); Dtype *cpu_top_data = (Dtype*)(*top)[0]->cpu_data(); - - printf("\n\nbottom data GPU:\n"); - for(int i=0; i::Forward_gpu_opt2(const vector*>& botto //CHECK_BLOB_DATA(bottom[i],10,"bottom"); Dtype* top_data = top[i]->mutable_gpu_data(); - //int col_offset = K_ * N_; - //int top_offset = M_ * N_; - //int weight_offset = M_ * K_; this->opt_num2 = global_packing_N; for (int n = 0; n < this->num_; n += this->opt_num2) { @@ -183,11 +164,63 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); CHECK_BLOB_DATA(top[0],20, "top[0]"); } + template void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { this->backward_gpu_opt(top, propagate_down, bottom); } + + +template +void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + this->ocl_memset(bias_diff, 0., this->blobs_[1]->count()); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + this->weight_offset_ = this->M_ * this->K_; + this->opt_num2 = global_packing_N; + for (int n = 0; n < this->num_; ++n) { + this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; + this->top_offset_n = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_ = this->M_ * (this->N * this->opt_num2); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm_opt(bottom_data, + top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_gpu_gemm_opt(top_diff, weight, + bottom_diff); + } + } + } + } + + CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); + CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); + CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); + CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); +} template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { From 2fdb29af929e7023f520a2cd55be506a1d7b1b9d Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 4 Aug 2015 14:09:03 +0800 Subject: [PATCH 024/124] finished debugging for conv optimized scheme --- include/caffe/vision_layers.hpp | 3 ++- src/caffe/layers/base_conv_layer.cpp | 3 ++- src/caffe/layers/conv_layer.cpp | 4 ++-- src/caffe/util/math_functions.cpp | 4 ++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 233bf48f..6f306545 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -119,7 +119,8 @@ class BaseConvolutionLayer : public Layer { inline void conv_transpose_gpu(const Dtype* data){ opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); } - inline void ocl_memset(Dtype* data, Dtype value, int count) { +protected: + inline void gpu_memset(Dtype* data, Dtype value, int count) { ocl_memset(oclmem_kernel, data, value, count); } #endif diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ebd65713..99643465 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -387,7 +387,8 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, //Dtype* col_buff = col_buffer_.mutable_gpu_data(); cl_command_queue Queue; if (is_1x1_) { - (Dtype*)transMem = input; + int count = height_ * width_ 
* conv_in_channels_ * opt_num2; + caffe_gpu_copy(count, input, (Dtype*)transMem); } for (int g = 0; g < group_; ++g) { #ifdef multiQ diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 34490f68..2dc65ac7 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -183,7 +183,7 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, // Bias gradient, if necessary. if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - this->ocl_memset(bias_diff, 0., this->blobs_[1]->count()); + this->gpu_memset(bias_diff, 0., this->blobs_[1]->count()); for (int n = 0; n < this->num_; ++n) { // this->top_offset_ = top[i]->offset(n); @@ -201,7 +201,7 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, this->top_offset_n = top[i]->offset(n); this->bottom_offset_ = bottom[i]->offset(n); this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_ = this->M_ * (this->N * this->opt_num2); + this->top_offset_ = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. weight. Note that we will accumulate diffs. 
if (this->param_propagate_down_[0]) { this->weight_gpu_gemm_opt(bottom_data, diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 8bc16ea3..b877da50 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -241,13 +241,13 @@ void caffe_copy(const int N, const double* X, double* Y) { template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { if(X != Y) - CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasScopy( N * sizeof(float), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { if(X != Y) - CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasDcopy( N * sizeof(double), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> From 9a416709ac27ab960be180f5eba9689789e78209 Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 4 Aug 2015 14:19:12 +0800 Subject: [PATCH 025/124] minor change --- src/caffe/layers/conv_layer.cpp | 2 +- src/caffe/util/math_functions.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 2dc65ac7..342d7842 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,7 +77,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt2(bottom, top); + Forward_gpu_opt(bottom, top); else Forward_gpu_org(bottom, top); } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index b877da50..8bc16ea3 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -241,13 +241,13 @@ void 
caffe_copy(const int N, const double* X, double* Y) { template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { if(X != Y) - CLBLAS_CHECK( clblasScopy( N * sizeof(float), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { if(X != Y) - CLBLAS_CHECK( clblasDcopy( N * sizeof(double), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> From 472b84a5ac9aed901cd2f396c8a89f90a897288e Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 5 Aug 2015 13:34:43 -0700 Subject: [PATCH 026/124] conv layer clean up --- include/caffe/vision_layers.hpp | 9 +++++---- src/caffe/layers/conv_layer.cpp | 14 ++++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 6f306545..8498cb58 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -125,6 +125,7 @@ class BaseConvolutionLayer : public Layer { } #endif +private: int conv_out_channels_; int conv_in_channels_; int conv_out_spatial_dim_; @@ -224,12 +225,12 @@ class ConvolutionLayer : public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu_opt(const vector*>& bottom, - const vector*>& top); + //virtual void Forward_gpu_opt(const vector*>& bottom, + // const vector*>& top); virtual void Forward_gpu_opt2(const vector*>& bottom, const vector*>& top); - virtual void Backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + //virtual void Backward_gpu_opt(const vector*>& top, 
+ // const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom); }; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 342d7842..1037a8cf 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,7 +77,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt(bottom, top); + Forward_gpu_opt2(bottom, top); else Forward_gpu_org(bottom, top); } @@ -86,11 +86,12 @@ template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N >1) - Backward_gpu_opt(top, propagate_down, bottom); + Backward_gpu_opt2(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); } +/* template void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom, const vector*>& top) { @@ -101,7 +102,7 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom LOG(WARNING) << "conv fp done"; #endif -} +}*/ template void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, @@ -117,7 +118,7 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? 
(this->num_ - n) : this->opt_num2; - //two intermediate variables to pass offset + //intermediate variables to pass offset this->top_offset_ = this->M_ * this->N_ * this->opt_num2; this->top_offset_n = top[i]->offset(n); this->col_offset_ = this->K_ * this->N_ * this->opt_num2; @@ -133,7 +134,7 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto } } - // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); CHECK_BLOB_DATA(top[0],20, "top[0]"); } @@ -165,12 +166,13 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom CHECK_BLOB_DATA(top[0],20, "top[0]"); } +/* template void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { this->backward_gpu_opt(top, propagate_down, bottom); } - +*/ template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, From 649b3abe716c281aef9e6d141c9c8cf4fb8c812c Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 6 Aug 2015 04:52:32 +0800 Subject: [PATCH 027/124] fixed the bug in syncedmem set_cpu_data --- include/caffe/util/math_functions.hpp | 2 +- src/caffe/OCL_kernel.cl | 1 + src/caffe/data_transformer.cpp | 5 ++- src/caffe/layers/base_conv_layer.cpp | 4 +-- src/caffe/layers/base_data_layer.cpp | 42 ++++++++++++++++-------- src/caffe/layers/inner_product_layer.cpp | 2 +- src/caffe/net.cpp | 1 + src/caffe/syncedmem.cpp | 12 +++++-- src/caffe/util/math_functions.cpp | 4 +-- 9 files changed, 48 insertions(+), 25 deletions(-) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index c9a391ac..a5ca6470 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -55,7 +55,7 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, Dtype* y); template -void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, 
const Dtype* A, size_t offA, int lda, const Dtype * x, size_t offx, const Dtype beta, int incx, Dtype* y, size_t offy, int incy); diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 48076725..8ab1c711 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -742,6 +742,7 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size) buffer[gdx] = value; } } +template __attribute__ ((mangled_name(oclmem))) __kernel void OCL_memset2(__global int* buffer, const int value, const int size); template __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 2a3bc645..f6d80dc2 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -7,7 +7,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" - +#include "caffe/util/benchmark.hpp" namespace caffe { template @@ -24,7 +24,6 @@ DataTransformer::DataTransformer(const TransformationParameter& param, ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); data_mean_.FromProto(blob_proto); } - printf("before if\n"); // check if we want to use mean_value if (param_.mean_value_size() > 0) { CHECK(param_.has_mean_file() == false) << @@ -33,7 +32,6 @@ DataTransformer::DataTransformer(const TransformationParameter& param, mean_values_.push_back(param_.mean_value(c)); } } - printf("reaches here\n"); } template @@ -127,6 +125,7 @@ void DataTransformer::Transform(const Datum& datum, template void DataTransformer::Transform(const Datum& datum, Blob* transformed_blob) { + // If datum is encoded, decoded and transform the cv::image. 
if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 99643465..55046847 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -465,7 +465,7 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { /* caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.gpu_data(), 1., bias);*/ - caffe_gpu_gemvv(CblasNoTrans, num_output_, height_out_*width_out_, + caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_*width_out_, (Dtype)1., input, top_offset_, height_out_*width_out_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias, (size_t)0, 1); @@ -553,7 +553,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < num_; ++n) { - caffe_gpu_gemvv(CblasNoTrans, M_, N_, + caffe_gpu_gemv(CblasNoTrans, M_, N_, (Dtype)1., top_diff, top[i]->offset(n), N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias_diff, (size_t)0, 1); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 60dfde75..1b6e07fa 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -60,7 +60,13 @@ template void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { // First, join the thread + CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer; + join_prefetch_timer.Start(); JoinPrefetchThread(); + join_prefetch_timer.Stop(); + printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds()); + + forward_timer.Start(); DLOG(INFO) << "Thread joined"; // Reshape to loaded data. 
top[0]->ReshapeLike(prefetch_data_); @@ -75,37 +81,42 @@ void BasePrefetchingDataLayer::Forward_cpu( caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), top[1]->mutable_cpu_data()); } - - CHECK_BLOB_DATA(top[0], 20, "top[0]"); - + forward_timer.Stop(); + printf("write buffer time: %f\n", forward_timer.MilliSeconds()); // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; + create_prefetch_timer.Start(); CreatePrefetchThread(); - + create_prefetch_timer.Stop(); + printf("create prefetch time: %f\n", create_prefetch_timer.MilliSeconds() ); } template void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - printf("HHHHHH Data forward time: n\n"); - // First, join the thread + CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer; + + join_prefetch_timer.Start(); JoinPrefetchThread(); - CPUTimer forward_timer; - forward_timer.Start(); - + join_prefetch_timer.Stop(); + printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds()); // Copy the data from prefetch thread to data_layer //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); - top[0]->ReshapeLike(prefetch_data_); - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); + + clFinish(amdDevice.CommandQueue); + forward_timer.Start(); + top[0]->ReshapeLike(this->prefetch_data_); + OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); if (this->output_labels_) { // Reshape to loaded labels. 
- top[1]->ReshapeLike(prefetch_label_); + top[1]->ReshapeLike(prefetch_label_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); } - clFinish(amdDevice.CommandQueue); + +// clFinish(amdDevice.CommandQueue); forward_timer.Stop(); - printf("Data forward time: %f\n\n", forward_timer.MilliSeconds()); + printf("Write buffer time: %f\n\n", forward_timer.MilliSeconds()); #ifdef Track_data_transfer @@ -115,7 +126,10 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; + create_prefetch_timer.Start(); CreatePrefetchThread(); + create_prefetch_timer.Stop(); + printf("create_prefetch time: %f\n", create_prefetch_timer.MilliSeconds()); //return Dtype(0.); } diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 4242afa3..676650c2 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -148,7 +148,7 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bias - caffe_gpu_gemvv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, + caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, (size_t)0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)0., 1, this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 4d20cdd7..df376ff0 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -35,6 +35,7 @@ Net::Net(const string& param_file, Phase phase) { template void 
Net::Init(const NetParameter& in_param) { // Set phase from the state. + amdDevice.Init(); phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and // the current NetState. diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index e98e6847..a44641ef 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -36,7 +36,7 @@ if (cpu_ptr_ && own_cpu_data_) { void SyncedMemory::ocl_setup() { cl_int err=0; - oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); + oclmem_kernel = clCreateKernel(amdDevice.Program, "memset", &err); OCL_CHECK(err); } @@ -125,7 +125,7 @@ const void* SyncedMemory::cpu_data() { } void SyncedMemory::set_cpu_data(void* data) { -CHECK(data); +/*CHECK(data); if (own_cpu_data_) { OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL)); OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_)); @@ -135,6 +135,14 @@ CHECK(data); cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); head_ = HEAD_AT_CPU; own_cpu_data_ = false; +*/ + CHECK(data); + if (own_cpu_data_) { + CaffeFreeHost(cpu_ptr_); + } + cpu_ptr_ = data; + head_ = HEAD_AT_CPU; + own_cpu_data_ = false; } const void* SyncedMemory::gpu_data() { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 8bc16ea3..f4ac6617 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -137,7 +137,7 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, } template <> -void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, size_t offA, int lda, const float* x, size_t offx, const float beta, int incx, float* y, size_t offy, int incy) { @@ -150,7 +150,7 @@ void caffe_gpu_gemvv(const 
CBLAS_TRANSPOSE TransA, const int M, } template <> -void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M, +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double* A, size_t offA, int lda, const double* x, size_t offx, const double beta, int incx, double* y, size_t offy, int incy) { From b204a85ca226acee4c63bb16fb50ceb8cf324730 Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 6 Aug 2015 11:13:11 +0800 Subject: [PATCH 028/124] gconv opt debug --- include/caffe/vision_layers.hpp | 12 ++++++------ src/caffe/OCL_kernel.cl | 1 - src/caffe/layers/base_conv_layer.cpp | 10 +++------- src/caffe/layers/conv_layer.cpp | 14 ++++---------- src/caffe/net.cpp | 2 +- src/caffe/syncedmem.cpp | 2 +- src/caffe/util/math_functions.cpp | 2 -- 7 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 8498cb58..336127d5 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -105,11 +105,11 @@ class BaseConvolutionLayer : public Layer { col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); } - inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) { + inline void conv_im2col_gpu_opt(const Dtype* data) { im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); } - inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) { + inline void conv_col2im_gpu_opt( Dtype* data) { col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); } @@ -225,12 +225,12 @@ class ConvolutionLayer : public BaseConvolutionLayer { const vector*>& top); virtual void 
Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - //virtual void Forward_gpu_opt(const vector*>& bottom, - // const vector*>& top); + virtual void Forward_gpu_opt(const vector*>& bottom, + const vector*>& top); virtual void Forward_gpu_opt2(const vector*>& bottom, const vector*>& top); - //virtual void Backward_gpu_opt(const vector*>& top, - // const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu_opt(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom); }; diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl index 8ab1c711..48076725 100644 --- a/src/caffe/OCL_kernel.cl +++ b/src/caffe/OCL_kernel.cl @@ -742,7 +742,6 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size) buffer[gdx] = value; } } -template __attribute__ ((mangled_name(oclmem))) __kernel void OCL_memset2(__global int* buffer, const int value, const int size); template __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 55046847..8edecdc0 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -308,16 +308,14 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, template void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; cl_command_queue Queue; cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data()); + conv_im2col_gpu_opt(input); } //col_buff = col_buffer_.gpu_data(); } - #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; @@ -338,9 +336,7 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt 
(const Dtype* input, (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); } #endif - conv_transform_gpu((Dtype*)subTopMem, output); - } @@ -410,7 +406,7 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #endif if (!is_1x1_) { - conv_col2im_gpu_opt((Dtype*)transMem, input); + conv_col2im_gpu_opt(input); } } @@ -435,7 +431,7 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* col_buff = input; cl_command_queue Queue; if (!is_1x1_) { - conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data()); + conv_im2col_gpu_opt(input); //col_buff = col_buffer_.gpu_data(); } conv_transpose_gpu(output); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 1037a8cf..48b7afe9 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,7 +77,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt2(bottom, top); + Forward_gpu_opt(bottom, top); else Forward_gpu_org(bottom, top); } @@ -86,12 +86,11 @@ template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N >1) - Backward_gpu_opt2(top, propagate_down, bottom); + Backward_gpu_opt(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); } -/* template void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom, const vector*>& top) { @@ -102,12 +101,11 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom LOG(WARNING) << "conv fp done"; #endif -}*/ +} template void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -115,7 +113,6 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto Dtype* top_data 
= top[i]->mutable_gpu_data(); this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; //intermediate variables to pass offset @@ -126,12 +123,11 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto this->weight_offset_ = this->M_ * this->K_; this->forward_gpu_gemm_opt(bottom_data, weight, top_data); - if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); this->forward_gpu_bias_opt(top_data, bias); } - } + } } CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); @@ -166,13 +162,11 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom CHECK_BLOB_DATA(top[0],20, "top[0]"); } -/* template void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { this->backward_gpu_opt(top, propagate_down, bottom); } -*/ template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index df376ff0..ad6bdc7e 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -35,7 +35,7 @@ Net::Net(const string& param_file, Phase phase) { template void Net::Init(const NetParameter& in_param) { // Set phase from the state. - amdDevice.Init(); + //amdDevice.Init(); phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and // the current NetState. 
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index a44641ef..ac1187b9 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -36,7 +36,7 @@ if (cpu_ptr_ && own_cpu_data_) { void SyncedMemory::ocl_setup() { cl_int err=0; - oclmem_kernel = clCreateKernel(amdDevice.Program, "memset", &err); + oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); OCL_CHECK(err); } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index f4ac6617..54e0abdc 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -102,7 +102,6 @@ cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE Tr int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; int ldc = N; - //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) ); CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); return event; } @@ -117,7 +116,6 @@ cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE T int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; - //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) ); CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); return event; } From 4c4b9d33eade5699a5c16870cd4c06ecdfd6bdd0 Mon Sep 17 00:00:00 2001 From: Yuan Date: Sat, 8 Aug 2015 05:18:22 +0800 Subject: [PATCH 029/124] Split OpenCL kernels into different files --- include/caffe/device.hpp | 4 +- src/caffe/OCL_kernel.cl | 1837 ------------------------ src/caffe/device.cpp | 29 +- src/caffe/ocl/OCL_kernel.cl | 999 +++++++++++++ src/caffe/ocl/dropout_layer.cl | 18 + src/caffe/ocl/im2col.cl | 298 ++++ src/caffe/ocl/lrn_layer.cl | 113 ++ src/caffe/ocl/pooling_layer.cl | 267 ++++ src/caffe/ocl/relu_layer.cl | 20 + src/caffe/ocl/softmax_layer.cl | 48 + src/caffe/ocl/softmaxwithloss_layer.cl | 65 + 11 files changed, 1852 insertions(+), 1846 deletions(-) delete mode 100644 src/caffe/OCL_kernel.cl create mode 100644 src/caffe/ocl/OCL_kernel.cl create mode 100644 src/caffe/ocl/dropout_layer.cl create mode 100644 src/caffe/ocl/im2col.cl create mode 100644 src/caffe/ocl/lrn_layer.cl create mode 100644 src/caffe/ocl/pooling_layer.cl create mode 100644 src/caffe/ocl/relu_layer.cl create mode 100644 src/caffe/ocl/softmax_layer.cl create mode 100644 src/caffe/ocl/softmaxwithloss_layer.cl diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 7360dacd..0b534e57 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -26,13 +26,13 @@ class Device{ cl_int Init(); - cl_int ConvertToString(const char *pFileName,std::string &Str); + cl_int ConvertToString(std::string pFileName,std::string &Str); void DisplayPlatformInfo(); void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); void 
GetDeviceInfo(); - cl_program BuildProgram(const char*); + cl_program BuildProgram(std::string); template void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str); diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl deleted file mode 100644 index 48076725..00000000 --- a/src/caffe/OCL_kernel.cl +++ /dev/null @@ -1,1837 +0,0 @@ -#pragma OPENCL EXTENSION cl_amd_printf : enable - -//beginning of the looooooong gpu_random_generator kernel -//we use the open sourced threefry's GPU implementation -typedef uint uint32_t; - -struct r123array4x32 { uint32_t v[4]; }; - -enum r123_enum_threefry32x4 -{ - R_32x4_0_0 = 10, R_32x4_0_1 = 26, - R_32x4_1_0 = 11, R_32x4_1_1 = 21, - R_32x4_2_0 = 13, R_32x4_2_1 = 27, - R_32x4_3_0 = 23, R_32x4_3_1 = 5, - R_32x4_4_0 = 6, R_32x4_4_1 = 20, - R_32x4_5_0 = 17, R_32x4_5_1 = 11, - R_32x4_6_0 = 25, R_32x4_6_1 = 10, - R_32x4_7_0 = 18, R_32x4_7_1 = 20 -}; - -inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); -inline uint32_t RotL_32(uint32_t x, unsigned int N) -{ - return (x << (N & 31)) | (x >> ((32 - N) & 31)); -} - -typedef struct r123array4x32 threefry4x32_ctr_t; -typedef struct r123array4x32 threefry4x32_key_t; -typedef struct r123array4x32 threefry4x32_ukey_t; - -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) -{ - threefry4x32_ctr_t X; - uint32_t ks[4 + 1]; - int i; - ks[4] = 0x1BD11BDA; - /* - for (i = 0; i < 4; i++) - { - ks[i] = k.v[i]; - X.v[i] = in.v[i]; - ks[4] ^= k.v[i]; - }*/ - { - ks[0] = k.v[0]; - X.v[0] = in.v[0]; - ks[4] ^= k.v[0]; - - ks[1] = k.v[1]; - X.v[1] = in.v[1]; - ks[4] ^= k.v[1]; - - ks[2] = k.v[2]; - X.v[2] = in.v[2]; - ks[4] ^= k.v[2]; - - ks[3] = k.v[3]; - X.v[3] = in.v[3]; - ks[4] ^= k.v[3]; - } - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - 
X.v[3] += ks[3]; - if (Nrounds > 0) - { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 1) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 2) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 3) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 3) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 1; - } if (Nrounds > 4) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 5) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 6) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 7) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 7) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 2; - } if (Nrounds > 8) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 9) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= 
X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 10) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 11) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 11) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 3; - } if (Nrounds > 12) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 13) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 14) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 15) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 15) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 4; - } if (Nrounds > 16) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 17) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 18) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 
19) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 19) { - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 5; - } if (Nrounds > 20) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 21) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 22) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 23) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 23) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 6; - } if (Nrounds > 24) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 25) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 26) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 27) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 27) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 7; - } 
if (Nrounds > 28) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 29) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 30) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 31) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 31) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 8; - } if (Nrounds > 32) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 33) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 34) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 35) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 35) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 9; - } if (Nrounds > 36) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 37) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - 
X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 38) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 39) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 39) { - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 10; - } if (Nrounds > 40) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 41) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 42) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 43) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 43) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 11; - } if (Nrounds > 44) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 45) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 46) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 47) { - 
X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 47) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 12; - } if (Nrounds > 48) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 49) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 50) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 51) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 51) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 13; - } if (Nrounds > 52) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 53) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 54) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 55) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 55) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 14; - } if 
(Nrounds > 56) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 57) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 58) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 59) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 59) { - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 15; - } if (Nrounds > 60) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 61) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 62) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 63) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 63) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 16; - } if (Nrounds > 64) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 65) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - 
X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 66) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 67) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 67) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 17; - } if (Nrounds > 68) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 69) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 70) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 71) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 71) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 18; - } - return X; -} - -template -__kernel void PRNG_threefry4x32( - __global uint4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T inf, - T sup, - T threshold, - uint nrounds, - uint numrandom -){ - size_t gdx = get_global_id(0); - - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; - - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - - threefry4x32_ctr_t random4; - - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - uint4 frnd; - - frnd.x = ( (((float)random4.v[0]) / 
r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - - randomnumber[gdx] = frnd; - } -} - - -template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); - -template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); - -//end of the looooooong gpu_random_generator kernel - - -template -__kernel void OCL_memset(__global T* buffer, const T value, const int size){ - int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; - } -} - -template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); -template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); - -__kernel void OCL_memset2(__global int* buffer, const int value, const int size){ - int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; - } -} - -template -__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ - int gdx = get_global_id(0); - if(gdx < N){ - Y[gdx] =((0.0 -__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){ - int index=get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; - if(index < n){ - int w_out=index %width_col; - index 
/= width_col; - int h_out=index%height_col; - int channel_in = index/height_col; - int channel_out=channel_in *ksize *ksize; - int h_in = h_out *stride-pad; - int w_in = w_out *stride-pad; - data_col +=(channel_out *height_col + h_out) *width_col + w_out; - data_im +=(channel_in * height + h_in) *width + w_in; - int i=0,j=0; - for(i=0;i= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col +=height_col *width_col; - } - } - } -} - -template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); - -template -__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){ - - int index = get_global_id(0); - - data_im = data_im + img_offset; - data_col = data_col + col_offset; - - int x_out = index % width_col; - int y_out = (index / width_col) % height_col; - int channel_in = (index / width_col / height_col) % channels; - int channel_out = channel_in * ksize * ksize; - int im_id = index / width_col / height_col / channels; - - int y_in = y_out * stride - pad; - int x_in = x_out * stride - pad; - int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; - int offset_im = im_id * channels * height * width 
+ channel_in * height * width; - - for(int k_h = 0; k_h < ksize; k_h++){ - for(int k_w = 0; k_w < ksize; k_w++){ - int x_im = x_in + k_w; - int y_im = y_in + k_h; - int index_im = y_im * width + x_im; - int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; - if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) - data_col[offset_col + index_col] = data_im[offset_im + index_im]; - else - data_col[offset_col + index_col] = 0; - } - } -} - -template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); -template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); - - -template -__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_col, const int col_offset) { - data_im = data_im + img_offset; - data_col = data_col + col_offset; - - int index = get_global_id(0); - if(index < n) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - __global T* data_col_ptr = 
data_col; - data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; - __global const T* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + i; - int w = w_in + j; - *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; - data_col_ptr += height_col * width_col; - } - } - } -} - -template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel(const int n, __global const float* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2col_gpu_double_kernel))) void im2col_gpu_kernel(const int n, __global const double* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global double* data_col, const int col_offset); - -template -__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_im, const int img_offset) { - data_col = data_col + col_offset; - data_im = data_im + img_offset; - int index = get_global_id(0); - if(index < n) { - T val = 0; - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < patch_w) ? 
0 : (w - patch_w) / stride_w + 1; - int w_col_end = min(w / stride_w + 1, width_col); - int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; - int h_col_end = min(h / stride_h + 1, height_col); - // equivalent implementation - int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; - int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; - int coeff_w_col = (1 - stride_w * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} - -template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w,const int pad_h, const int pad_w, - const int stride_h, const int stride_w,const int height_col, const int width_col, - __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, - const int col_offset, const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); - -template -__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){ - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n){ - T val = 0; - int w = index % width + pad; - 
int h = (index / width) % height + pad; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} -template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); - -template -__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index;index= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col += height_col *width_col; - } - } - } -} - -template __attribute__((mangled_name(im2colfloat_yuan))) 
__kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); -template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); - -template -__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){ - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n){ - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height) % channels; - int im = index / width / height / channels; - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col * optnum); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} -template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); -template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); - - -template -__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < n; index += tmp){ - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} -template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); -template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); - -template -__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){ - - int index = get_global_id(0); - data_opt = data_opt + opt_offset; - data_im = data_im + im_offset; - if(index < n){ - int w = index % width; - int h = (index / width) % height; - int c = index / (width * height) % channels; - int im = index / width / height / channels; - - int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; - data_opt[opt_index] = data_im[index]; - } -} -template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int 
opt_offset, const int optnum); -template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); - - -template -__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp){ - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - T maxval = -FLT_MAX; - int maxidx = -1; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_data[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_data[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} -template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int 
pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); -template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); - -template -__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp){ - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - T aveval = 0; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_data[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } - -} -template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, 
const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); -template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); - -template -__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp){ - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - T cumsum = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. 
- cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_data[h * width + w]; - return; - } - } - } - } -} -template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); -template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); - -template -__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < count; index+=tmp){ - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = 
FLT_MIN; - T cumsum = FLT_MIN; - T cumvalues = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; } -} -template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); -template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); - -template -__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, - __global int* mask, __global T* top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, __global T* const bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 
0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - T gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff += offset; - if (mask) { - mask = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } else { - top_mask = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} -template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); -template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); - -template -__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int 
width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){ - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - T gradient = 0; - top_diff += (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - -template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); -template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, 
const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); - -template -void StoPoolBackward(const int nthreads, - __global Dtype* rand_idx, __global Dtype* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global Dtype* bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - rand_idx = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - top_diff = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff[ph * pooled_width + pw] * - (index == static_cast(rand_idx[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - - } -} -template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel void StoPoolBackward(const int nthreads, - __global float* rand_idx, __global float* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* bottom_diff); -template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward(const int 
nthreads, - __global double* rand_idx, __global double* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* bottom_diff); - -template -__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ - int index = get_global_id(0); - if(index < count) - out[index] = in[index] > 0? in[index]:in[index]*negative_slope; -} - -//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); - -template -__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ - int index = get_global_id(0); - if(index < count) - out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; -} - -template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); -template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); - -template -__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) { - T maxval = -FLT_MAX; - for (int i = 0; i < dim; i++) - maxval = max( data[index*dim + i], maxval 
); - out[index] = maxval; - } -} - -template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); -template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out); - -template -__kernel void exp (const int num, __global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) - out[index] = exp(data[index]); -} - -template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); -template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); - -template -__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){ - //printf("softmax_div\n"); - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num*dim; index += total){ - int n = index / dim; - data[index] /= scale[n]; - } -} - -template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); -template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data); - -template -__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){ - - int gid = get_global_id(0); - int size = get_global_size(0); - - resultScratch[gid] = 0.0; - for(int i = gid; i < num; i += size){ - resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(gid < 128) - resultScratch[gid] += resultScratch[gid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - if(gid < 64) - resultScratch[gid] += resultScratch[gid + 64]; - if(gid < 32) - resultScratch[gid] 
+= resultScratch[gid + 32]; - if(gid < 16) - resultScratch[gid] += resultScratch[gid + 16]; - if(gid < 8) - resultScratch[gid] += resultScratch[gid + 8]; - if(gid < 4) - resultScratch[gid] += resultScratch[gid + 4]; - if(gid < 2) - resultScratch[gid] += resultScratch[gid + 2]; - if(gid < 1){ - resultScratch[gid] += resultScratch[gid + 1]; - loss[0] = resultScratch[gid]; - } - -} - -template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); -template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); - -template -__kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* out); -template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* out); - -template -__kernel void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_max, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * 
spatial_dim + s]; - } -} - -template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); -template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); - -template -__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = exp(data[index]); - } -} - -template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); -template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); - -template -__kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* channel_sum) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } -} - -template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* channel_sum); -template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* channel_sum); - -template -__kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int 
spatial_dim, __global const T* channel_sum, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const float* channel_sum, __global float* data); -template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const double* channel_sum, __global double* data); - -template -__kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const T* data_1, __global const T* data_2, - __global T* channel_dot) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const float* data_1, __global const float* data_2, - __global float* channel_dot); -template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const double* data_1, __global const double* data_2, - __global double* channel_dot); - - -template -__kernel void SoftmaxLossForwardGPU(const int nthreads, - __global T* prob_data, __global T* label,__global T* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global T* counts) { - int index 
= get_global_id(0); - if(index < nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - T(FLT_MIN))); - counts[index] = 1; - } - } -} - -template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global float* prob_data, __global float* label,__global float* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global float* counts); -template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global double* prob_data, __global double* label,__global double* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global double* counts); - -template -__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, - __global T* label,__global T* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, T* counts) { - const int channels = dim / spatial_dim; - int index = get_global_id(0); - if(index < nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - - -template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, - __global float* label,__global float* bottom_diff, int num, int dim, - int spatial_dim, bool 
has_ignore_label_, - int ignore_label_, float* counts); - -template __attribute__ ((mangled_name(softmax_loss_bp_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, - __global double* label,__global double* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, double* counts); - - -template -__kernel void diff (const int num, const int dim, __global T* data, __global T* label){ - int index = get_global_id(0); - int total = get_global_size(0); - int offset; - for(index; index < num; index += total){ - offset = (int) label[index]; - data[index * dim + offset] -= 1; - } -} - -template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); -template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); - -template -__kernel void scal (const int num, const T alpha, __global T* data){ - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num; index += total){ - data[index] = data[index] * alpha; - } -} - -template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); -template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); - -template -__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] = a[index] / b[index]; -} - -template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); -//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); - -template -__kernel void add_scalar 
(const int n, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] += alpha; -} - -template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); -template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); - -template -__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ - int index = get_global_id(0); - if (index < n) - y[index] = in1[index] + in2[index] ; -} -template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); -template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); - -template -__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] = a[index] * b[index]; -} - -template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); -template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); - - -template -__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) -// y[index] = a[index] + alpha; - y[index] = pow(a[index], alpha); -} - -template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); -template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); - -template -__kernel void 
DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){ - int index = get_global_id(0); - if (index < n) - out[index] = in[index] * scale * mask[index]; -} -template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); -template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); - - -template -__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){ - int index = get_global_id(0); - if (index < n) - out_diff[index] = in_diff[index] * scale * mask[index]; -} -template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); -template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); - -template -__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - in = in + offset; - scale = scale + offset; - int head = 0; - const int pre_pad = (size 
- 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in[head * step] * in[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in[head * step] * in[head * step]; - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } -} - -template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); -template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); - -template -__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) - out[index] = in[index] * pow(scale[index], negative_beta); -} -template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); -template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, 
__global double* in, __global double* scale, const double negative_beta, __global double* out); - -template -__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - bottom_data += offset; - top_data += offset; - scale += offset; - top_diff += offset; - bottom_diff += offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * 
bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } -} -} - -template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); -template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); - -template -__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidyy = gidy; - int index = gidy / height; - int offset = index * width * height; - gidy = gidy % height; - if( gidx < width && gidyy < height * optnum ) - dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; -} -template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); -template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); - -template -__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ - int gidx = get_global_id(0); - int index; - index = (optnum==1) ? 
0: gidx % optnum; - dst = dst + top_offset; // now we point at (*top)[n] - int offset = gidx / optnum; - int i = 0; - for(i = 0 ; i < width; i++) - dst[(index * height + offset)* width + i] = src[gidx * width + i]; -} -template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); -template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 0e98ada0..3beba234 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -125,11 +125,25 @@ cl_int Device::Init(){ return 0; } - //Read our own kernel file - const char *pFileName = "./src/caffe/OCL_kernel.cl"; - const char *pSource; std::string strSource = ""; - ConvertToString(pFileName, strSource); + + std::string pFileName[8]; + pFileName[0] = "./src/caffe/ocl/OCL_kernel.cl"; + pFileName[1] = "./src/caffe/ocl/lrn_layer.cl"; + pFileName[2] = "./src/caffe/ocl/pooling_layer.cl"; + pFileName[3] = "./src/caffe/ocl/dropout_layer.cl"; + pFileName[4] = "./src/caffe/ocl/relu_layer.cl"; + pFileName[5] = "./src/caffe/ocl/softmax_layer.cl"; + pFileName[6] = "./src/caffe/ocl/softmaxwithloss_layer.cl"; + pFileName[7] = "./src/caffe/ocl/im2col.cl"; + + for(int fileNum = 0; fileNum < 8; fileNum++) { + std::string tmpSource = ""; + ConvertToString(pFileName[fileNum], tmpSource); + strSource += tmpSource; + } + + const char *pSource; pSource = strSource.c_str(); size_t uiArrSourceSize[] = {0}; uiArrSourceSize[0] = strlen(pSource); @@ -206,11 +220,12 @@ cl_int Device::Init(){ //Use to read OpenCL source code -cl_int Device::ConvertToString(const char *pFileName,std::string &Str){ +cl_int Device::ConvertToString(std::string pFileName,std::string &Str){ size_t uiSize=0; size_t uiFileSize=0; char 
*pStr=NULL; - std::fstream fFile(pFileName,(std::fstream::in|std::fstream::binary)); + char *tmp = (char*)pFileName.data(); + std::fstream fFile(tmp,(std::fstream::in|std::fstream::binary)); if(fFile.is_open()){ fFile.seekg(0,std::fstream::end); uiSize=uiFileSize=(size_t)fFile.tellg(); @@ -232,7 +247,7 @@ cl_int Device::ConvertToString(const char *pFileName,std::string &Str){ return -1; } -cl_program Device::BuildProgram(const char *pFileName) +cl_program Device::BuildProgram(std::string pFileName) { //Read our own kernel file const char *pSource; diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl new file mode 100644 index 00000000..7014721b --- /dev/null +++ b/src/caffe/ocl/OCL_kernel.cl @@ -0,0 +1,999 @@ +#pragma OPENCL EXTENSION cl_amd_printf : enable + +//beginning of the looooooong gpu_random_generator kernel +//we use the open sourced threefry's GPU implementation +typedef uint uint32_t; + +struct r123array4x32 { uint32_t v[4]; }; + +enum r123_enum_threefry32x4 +{ + R_32x4_0_0 = 10, R_32x4_0_1 = 26, + R_32x4_1_0 = 11, R_32x4_1_1 = 21, + R_32x4_2_0 = 13, R_32x4_2_1 = 27, + R_32x4_3_0 = 23, R_32x4_3_1 = 5, + R_32x4_4_0 = 6, R_32x4_4_1 = 20, + R_32x4_5_0 = 17, R_32x4_5_1 = 11, + R_32x4_6_0 = 25, R_32x4_6_1 = 10, + R_32x4_7_0 = 18, R_32x4_7_1 = 20 +}; + +inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32 - N) & 31)); +} + +typedef struct r123array4x32 threefry4x32_ctr_t; +typedef struct r123array4x32 threefry4x32_key_t; +typedef struct r123array4x32 threefry4x32_ukey_t; + +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) +{ + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + /* + for (i = 0; i < 
4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + if (Nrounds > 0) + { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 7) { + 
X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 11) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if 
(Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 5; + } if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] 
+= X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 35) { + X.v[0] 
+= ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 
45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 47) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 54) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += 
X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 63) { + X.v[0] += 
ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 16; + } if (Nrounds > 64) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 65) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 66) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 67) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 67) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 17; + } if (Nrounds > 68) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 69) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 70) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } if (Nrounds > 71) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } if (Nrounds > 71) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 18; + } + return X; +} + +template +__kernel void PRNG_threefry4x32( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom +){ + size_t 
gdx = get_global_id(0); /* one work-item per output uint4; gdx also fills every key word below */ + + uint maxUint = 0; + maxUint--; /* unsigned wrap-around: largest uint value, used as the divisor r */ + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + /* each lane: scale the 32-bit draw by (sup - inf), shift by inf, store 1 if it exceeds threshold, else 0 */ + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + + randomnumber[gdx] = frnd; + } +} + /* explicit instantiations; the last parameter is spelled numrandom to match the definition */ + +template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandom); + +template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandom); + +//end of the looooooong gpu_random_generator kernel + + /* OCL_memset: writes value into buffer[gdx] for every work-item with gdx below size */ +template +__kernel void OCL_memset(__global T* buffer, const T value, const int size){ + int gdx = get_global_id(0); + if(gdx < size){ + buffer[gdx] = value; + } +} + +template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); +template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); + /* OCL_memset2: same fill as OCL_memset, fixed to int buffers */ +__kernel void OCL_memset2(__global int* buffer, const int value, const int size){ + int gdx = get_global_id(0); + if(gdx < size){ + buffer[gdx] = value; + } +} + /* NOTE(review): the body of caffe_gpu_sign and the head of get_max are truncated in this copy of the patch */ +template +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ + int gdx = get_global_id(0); + if(gdx < N){ + Y[gdx] =((0.0 +__kernel void get_max(const int 
num, const int dim, __global T* data, __global T* out){ /* get_max: work-item index (below num) scans data[index*dim .. index*dim + dim - 1] and stores the largest value in out[index] */ + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; /* NOTE(review): the float limit is also the seed when T is double */ + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); +template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out); + /* exp: out[index] = exp(data[index]) for every index below num */ +template +__kernel void exp (const int num, __global T* data, __global T* out){ + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); +} + +template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); +template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); + + /* kernel_channel_max: one work-item per (n, s) pair; takes the max over c of data[(n*channels + c)*spatial_dim + s] */ + +template +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const 
T* channel_max, __global T* data) { /* kernel_channel_subtract: for each index below count, subtracts channel_max[n*spatial_dim + s] from data[index] in place */ + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); + /* kernel_exp: out[index] = exp(data[index]) for every index below count */ +template +__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); + /* kernel_channel_sum: one work-item per (n, s) pair; sums data[(n*channels + c)*spatial_dim + s] over c into channel_sum[index] */ +template +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* channel_sum); +template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, + 
const int spatial_dim, __global const double* data, __global double* channel_sum); + /* kernel_channel_div: for each index below count, divides data[index] in place by channel_sum[n*spatial_dim + s] */ +template +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; /* no check for a zero divisor here */ + } +} + +template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + /* kernel_channel_dot: one work-item per (n, s) pair; accumulates data_1 * data_2 over c into channel_dot[index] */ +template +__kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } +} + +template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); +template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); + + + +template +__kernel void diff (const int num, 
const int dim, __global T* data, __global T* label){ /* diff: for each row index below num, subtracts 1 from data[index*dim + (int)label[index]]; each work-item steps by the global size */ + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total){ + offset = (int) label[index]; /* label is stored as T and truncated to an int column; no range check against dim */ + data[index * dim + offset] -= 1; + } +} + +template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); +template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); + /* scal: multiplies data[index] by alpha in place for index below num, stepping by the global size */ +template +__kernel void scal (const int num, const T alpha, __global T* data){ + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total){ + data[index] = data[index] * alpha; + } +} + +template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); +template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); + /* div: y[index] = a[index] / b[index] for index below n */ +template +__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] = a[index] / b[index]; +} + /* NOTE(review): only the float instantiation of div is active; the double one below is commented out */ +template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); +//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); + /* add_scalar: adds alpha to y[index] in place for index below n */ +template +__kernel void add_scalar (const int n, const T alpha, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] += alpha; +} + +template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); +template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); + 
+template /* caffe_gpu_add: y[index] = in1[index] plus in2[index] for index below n; pointer arguments carry __global because kernel pointer arguments need an address-space qualifier */ +__kernel void caffe_gpu_add(const int n, __global const Dtype* in1, __global const Dtype* in2, __global Dtype* y){ + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index] ; +} +template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, __global const float* in1, __global const float* in2, __global float* y); +template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, __global const double* in1, __global const double* in2, __global double* y); + /* element_mul: y[index] = a[index] * b[index] for index below n */ +template +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; +} + +template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); +template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); + + /* powx: y[index] = pow(a[index], alpha) for index below n */ +template +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ + int index = get_global_id(0); + if (index < n) +// y[index] = a[index] + alpha; + y[index] = pow(a[index], alpha); +} + +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); + + /* transpose: 2-D launch; gidy / height selects one of optnum consecutive width-by-height slabs, and element (gidy % height, gidx) of that slab is written to (gidx, gidy % height) in dst */ +template +__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +} +template 
__attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int height, int optnum); + /* transform: work-item gidx copies the width elements starting at src[gidx*width] into dst (advanced by top_offset) at row index*height + offset, where index = gidx % optnum and offset = gidx / optnum. NOTE(review): there is no bound check on gidx */ +template +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ + int gidx = get_global_id(0); + int index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0 ; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; +} +template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl new file mode 100644 index 00000000..3b1c479b --- /dev/null +++ b/src/caffe/ocl/dropout_layer.cl @@ -0,0 +1,18 @@ +template /* DropoutForward: out[index] = in[index] * scale * mask[index] for index below n */ +__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){ + int index = get_global_id(0); + if (index < n) + out[index] = in[index] * scale * mask[index]; +} +template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); + + +template +__kernel void 
DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){ /* out_diff[index] = in_diff[index] * scale * mask[index] for index below n; threshold is accepted but not read in this body */ + int index = get_global_id(0); + if (index < n) + out_diff[index] = in_diff[index] * scale * mask[index]; +} +template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl new file mode 100644 index 00000000..577dd58f --- /dev/null +++ b/src/caffe/ocl/im2col.cl @@ -0,0 +1,298 @@ +template /* im2col: one work-item per (channel_in, h_out, w_out) column position; copies the square ksize patch anchored at (h_in, w_in) into data_col, writing 0 where the source pixel is outside the image */ +__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){ + int index=get_global_id(0); + data_im = data_im + img_offset; + data_col = data_col + col_offset; + if(index < n){ + int w_out=index %width_col; + index /= width_col; + int h_out=index%height_col; + int channel_in = index/height_col; + int channel_out=channel_in *ksize *ksize; + int h_in = h_out *stride-pad; + int w_in = w_out *stride-pad; + data_col +=(channel_out *height_col + h_out) *width_col + w_out; + data_im +=(channel_in * height + h_in) *width + w_in; + int i=0,j=0; /* NOTE(review): the two loop headers and the h / w declarations that follow are truncated in this copy of the patch */ + for(i=0;i= 0 && w >= 0 && h < height && w < width) + *data_col=data_im[i * width + j]; + else *data_col=0; + data_col +=height_col *width_col; + } + } + } +} + +template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, 
const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); + /* im2col_opt: batched im2col; each work-item decodes (im_id, channel_in, y_out, x_out) from its global id and writes ksize*ksize values, with the optnum images laid side by side inside every column row. NOTE(review): n is never compared with index here, unlike im2col; confirm the host launches exactly the intended number of work-items */ +template +__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){ + + int index = get_global_id(0); + + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int x_out = index % width_col; + int y_out = (index / width_col) % height_col; + int channel_in = (index / width_col / height_col) % channels; + int channel_out = channel_in * ksize * ksize; + int im_id = index / width_col / height_col / channels; + + int y_in = y_out * stride - pad; + int x_in = x_out * stride - pad; + int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; + int offset_im = im_id * channels * height * width + channel_in * height * width; + + for(int k_h = 0; k_h < ksize; k_h++){ + for(int k_w = 0; k_w < ksize; k_w++){ + int x_im = x_in + k_w; + int y_im = y_in + k_h; + int index_im = y_im * width + x_im; + int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) + data_col[offset_col + index_col] = data_im[offset_im + index_im]; + else + data_col[offset_col + index_col] = 0; + } + } +} + +template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int img_offset, const int height, const int width, 
const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); + + /* im2col_gpu_kernel: im2col with separate kernel, pad and stride sizes per axis; one work-item per (channel_in, h_out, w_out), writing kernel_h*kernel_w values spaced height_col*width_col apart in data_col */ +template +__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; /* top-left corner of the patch; negative inside the padding */ + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; /* positions outside the image are written as 0 */ + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + /* instantiations of im2col_gpu_kernel, declared __kernel like the definition and like every other instantiation in this file */ +template __attribute__((mangled_name(im2col_gpu_float_kernel))) __kernel void im2col_gpu_kernel(const int n, __global const float* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2col_gpu_double_kernel))) __kernel void im2col_gpu_kernel(const int n, __global const double* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); + /* col2im_gpu_kernel: one work-item per image element (c, h, w); sums every data_col entry whose patch covers that element and stores the total in data_im[index] */ +template +__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; /* h_col_start .. h_col_end and w_col_start .. w_col_end bound the column positions that are summed for this element */ + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} + +template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + /* col2im: square-kernel variant of col2im_gpu_kernel (single ksize, pad and stride) */ +template +__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){ + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n){ + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height); + // compute the start and end of the output 
+ int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; /* w and h already include pad, so these bounds are in padded coordinates */ + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); + /* im2col_yuan: im2col variant without buffer offsets. NOTE(review): most of its body (outer loop header, index decoding, inner loop headers) is truncated in this copy of the patch */ +template +__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){ + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index;index= 0 && w >= 0 && h < height && w < width) + *data_col=data_im[i * width + j]; + else *data_col=0; + data_col += height_col *width_col; + } + } + } +} + +template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const 
int pad, const int stride, const int height_col, const int width_col, __global float* data_col); +template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); + /* col2im_opt: batched col2im; index decodes to (im, c, h, w) and the column buffer holds optnum images side by side in every row */ +template +__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){ + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n){ + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height) % channels; + int im = index / width / height / channels; /* image number within the batch */ + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; /* compared with col2im, the offset and both coefficients below carry an extra optnum factor, and the offset adds im*height_col*width_col for the image number */ + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col * optnum); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); + + /* col2im_yuan: col2im without buffer offsets; each work-item steps through index values by the global size */ +template +__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){ + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < n; index += tmp){ + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; /* same accumulation as col2im: sum the data_col entries that cover this image element */ + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); +template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); + /* opttrans: reorders elements from (im, c, h, w) order in data_im to (c, h, im, w) order in data_opt, one element per work-item */ +template +__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){ + + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + im_offset; + if(index < n){ + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } +} +template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int 
opt_offset, const int optnum); +template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl new file mode 100644 index 00000000..901b5b13 --- /dev/null +++ b/src/caffe/ocl/lrn_layer.cl @@ -0,0 +1,113 @@ +template /* LRNComputeOutput: out[index] = in[index] * pow(scale[index], negative_beta) for index below nthreads, stepping by the global size */ +__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); +} +template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); + /* LRNFillScale: one (n, h, w) position per loop pass; slides a window of size channels over in squared and writes k plus accum_scale * alpha_over_size into scale for every channel */ +template +__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { + int index = get_global_id(0); + int tmp = get_global_size(0); /* keep the original buffer starts so every pass of the strided loop rebases from them instead of stacking offsets */ __global T* const in_base = in; __global T* const scale_base = scale; + for(index; index < nthreads; index += tmp) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + in = in_base + offset; + scale = scale_base + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + 
while (head < post_pad && head < channels) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + } +} +template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); + +template +__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + int head = 0; + const int 
pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } +} +} + +template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); diff --git 
a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
new file mode 100644
index 00000000..5ac4bd52
--- /dev/null
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -0,0 +1,267 @@
+// Max pooling, forward. One loop iteration produces one output element
+// (n, c, ph, pw): it scans the input window clipped to the image and stores
+// the maximum in top_data[index]. The window-relative position of the maximum
+// (h * width + w inside the (n, c) plane, or -1 for an empty window) is stored
+// in `mask` when that pointer is non-null, otherwise in `top_mask`.
+//
+// The (n, c) plane is addressed through a local pointer; advancing the
+// bottom_data argument itself would accumulate plane offsets across the
+// iterations of the strided loop.
+template <class T>
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T maxval = -FLT_MAX;
+    int maxidx = -1;
+    __global T* bottom_slice = bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        if (bottom_slice[h * width + w] > maxval) {
+          maxidx = h * width + w;
+          maxval = bottom_slice[maxidx];
+        }
+      }
+    }
+    top_data[index] = maxval;
+    if (mask) {
+      mask[index] = maxidx;
+    } else {
+      top_mask[index] = maxidx;
+    }
+  }
+}
+template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
+template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
+
+// Average pooling, forward. One loop iteration produces one output element
+// (n, c, ph, pw). pool_size is computed from the window clipped to the padded
+// image (height + pad_h, width + pad_w), i.e. padding counts towards the
+// divisor, while the sum only covers the part of the window inside the image.
+//
+// The (n, c) plane is addressed through a local pointer so that plane offsets
+// do not accumulate across iterations of the strided loop.
+template <class T>
+__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    int hend = min(hstart + kernel_h, height + pad_h);
+    int wend = min(wstart + kernel_w, width + pad_w);
+    const int pool_size = (hend - hstart) * (wend - wstart);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    hend = min(hend, height);
+    wend = min(wend, width);
+    T aveval = 0;
+    __global T* bottom_slice = bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        aveval += bottom_slice[h * width + w];
+      }
+    }
+    top_data[index] = aveval / pool_size;
+  }
+}
+template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
+template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
+
+// Stochastic pooling, training pass. For each output element the window sum is
+// scaled by the value found in rand_idx[index] to obtain a threshold; the first
+// window element whose running sum reaches the threshold is selected. Its value
+// goes to top_data[index] and its global input index overwrites
+// rand_idx[index].
+// NOTE(review): rand_idx presumably arrives holding uniform samples in [0, 1];
+// confirm against the host code.
+//
+// Differences from a naive port:
+//  * the (n, c) plane is addressed through a local pointer, so plane offsets do
+//    not accumulate across iterations of the strided loop;
+//  * selection leaves the window scan with a flag instead of `return`, which
+//    would have made the work-item skip all of its remaining indices;
+//  * the threshold is kept in T so the double instantiation is not truncated
+//    to float.
+// NOTE(review): the double instantiation below is mangled as
+// "StoPoolForwardTrainDouble" (capital D), unlike every other kernel here.
+// Left unchanged because the host may look it up by that name.
+template <class T>
+__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    T cumsum = 0.;
+    __global T* bottom_slice = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+      }
+    }
+    const T thres = rand_idx[index] * cumsum;
+    // Second pass: get value, and set index.
+    cumsum = 0;
+    bool picked = false;
+    for (int h = hstart; h < hend && !picked; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+        if (cumsum >= thres) {
+          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
+          top_data[index] = bottom_slice[h * width + w];
+          picked = true;
+          break;
+        }
+      }
+    }
+  }
+}
+template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
+
+// Stochastic pooling, test pass: each output is the window's
+// sum(x^2) / sum(x), i.e. the activations weighted by themselves.
+// The (n, c) plane is addressed through a local pointer so that plane offsets
+// do not accumulate across iterations of the strided loop.
+template <class T>
+__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < count; index += total) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    // Start cumsum at FLT_MIN rather than 0 to avoid divide-by-zero problems
+    T cumsum = FLT_MIN;
+    T cumvalues = 0.;
+    __global T* bottom_slice = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+      }
+    }
+    top_data[index] = cumvalues / cumsum;
+  }
+}
+template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
+
+// Max pooling, backward. One loop iteration produces one input-gradient
+// element (n, c, h, w): it visits every pooled output whose window can contain
+// (h, w) and adds that output's top_diff when the recorded argmax equals
+// h * width + w. The argmax is read from `mask` when it is non-null, otherwise
+// from `top_mask`.
+//
+// The (n, c) planes of top_diff / mask / top_mask are addressed through local
+// pointers so that plane offsets do not accumulate across iterations of the
+// strided loop.
+template <class T>
+__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
+    __global int* mask, __global T* top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, __global T* const bottom_diff) {
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart =
+        (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
+    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
+    const int pwstart =
+        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
+    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
+    T gradient = 0;
+    const int offset = (n * channels + c) * pooled_height * pooled_width;
+    __global T* top_diff_slice = top_diff + offset;
+    if (mask) {
+      __global int* mask_slice = mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff_slice[ph * pooled_width + pw];
+          }
+        }
+      }
+    } else {
+      __global T* top_mask_slice = top_mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff_slice[ph * pooled_width + pw];
+          }
+        }
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
+}
+template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+
+// Average pooling, backward. One loop iteration produces one input-gradient
+// element: it sums top_diff / pool_size over every pooled output whose window
+// covers the (padded) input position, with pool_size computed exactly as in
+// the forward pass.
+// The (n, c) plane of top_diff is addressed through a local pointer so that
+// plane offsets do not accumulate across iterations of the strided loop.
+template <class T>
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    // w and h are expressed in padded coordinates
+    const int w = index % width + pad_w;
+    const int h = (index / width) % height + pad_h;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    T gradient = 0;
+    __global T* top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        // figure out the pooling size
+        int hstart = ph * stride_h - pad_h;
+        int wstart = pw * stride_w - pad_w;
+        int hend = min(hstart + kernel_h, height + pad_h);
+        int wend = min(wstart + kernel_w, width + pad_w);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
+}
+
+template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+
+// Stochastic pooling, backward. One loop iteration produces one input-gradient
+// element: it adds top_diff of every pooled output whose stored rand_idx
+// equals this element's global index.
+//
+// Declared __kernel to match the explicit instantiations below (the template
+// previously lacked the qualifier). The (n, c) planes are addressed through
+// local pointers so that plane offsets do not accumulate across iterations of
+// the strided loop.
+template <class Dtype>
+__kernel void StoPoolBackward(const int nthreads,
+    __global Dtype* rand_idx, __global Dtype* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global Dtype* bottom_diff) {
+  const int total = get_global_size(0);
+  for (int index = get_global_id(0); index < nthreads; index += total) {
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    Dtype gradient = 0;
+    const int offset = (n * channels + c) * pooled_height * pooled_width;
+    __global Dtype* rand_idx_slice = rand_idx + offset;
+    __global Dtype* top_diff_slice = top_diff + offset;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        gradient += top_diff_slice[ph * pooled_width + pw] *
+            (index == static_cast<int>(rand_idx_slice[ph * pooled_width + pw]));
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
+}
+template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel void StoPoolBackward(const int nthreads,
+    __global float* rand_idx, __global float* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global float* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward(const int
nthreads,
+    __global double* rand_idx, __global double* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
new file mode 100644
index 00000000..0d8d3b4e
--- /dev/null
+++ b/src/caffe/ocl/relu_layer.cl
@@ -0,0 +1,20 @@
+// Leaky ReLU, forward: out[i] = in[i] if in[i] > 0, else in[i] * negative_slope.
+// One element per work-item; work-items with index >= count do nothing.
+template <class T>
+__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
+  int index = get_global_id(0);
+  if(index < count)
+    out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
+}
+
+//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
+
+// Leaky ReLU, backward:
+//   out_diff[i] = in_diff[i] * (in_data[i] > 0 ? 1 : negative_slope)
+// The local derivative is parenthesised as a whole. Without the parentheses
+// `*` binds tighter than `+`, so negative_slope was added to the result for
+// non-positive inputs instead of scaling the incoming gradient.
+template <class T>
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
+  int index = get_global_id(0);
+  if(index < count)
+    out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
+}
+
+template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
diff --git a/src/caffe/ocl/softmax_layer.cl
b/src/caffe/ocl/softmax_layer.cl
new file mode 100644
index 00000000..711e4334
--- /dev/null
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -0,0 +1,48 @@
+// Sums -log(prob_data[i * dim + label[i]]) over i in [0, num) and writes the
+// total to loss[0].
+//
+// Each work-item accumulates a strided partial sum into resultScratch[gid];
+// the partial sums are then folded pairwise (128, 64, ..., 1).
+// NOTE(review): the scratch buffer is indexed by the *global* id and the
+// reduction starts at a fixed 128, so this assumes a launch of exactly one
+// work-group of 256 work-items with at least 256 scratch elements; confirm
+// against the host-side launch.
+//
+// A barrier follows every folding step. OpenCL gives no guarantee that
+// work-items run in lock-step, so each step must be visible to the whole
+// work-group before the next one reads it. The barrier sits outside the `if`
+// so that every work-item reaches it.
+template <class T>
+__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
+
+  int gid = get_global_id(0);
+  int size = get_global_size(0);
+
+  resultScratch[gid] = 0.0;
+  for(int i = gid; i < num; i += size){
+    resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(int half = 128; half > 0; half >>= 1){
+    if(gid < half)
+      resultScratch[gid] += resultScratch[gid + half];
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  if(gid == 0)
+    loss[0] = resultScratch[0];
+}
+template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
+template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
+
+// Divides each of the num * dim entries of `data` by the scale of its row:
+// data[n * dim + d] /= scale[n]. Each work-item starts at its global id and
+// advances by the global size.
+template <class T>
+__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){
+  //printf("softmax_div\n");
+  const int total = get_global_size(0);
+  for(int index = get_global_id(0); index < num*dim; index += total){
+    const int n = index / dim;
+    data[index] /= scale[n];
+  }
+}
+
+template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
+template
__attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
new file mode 100644
index 00000000..6d6e4f0b
--- /dev/null
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -0,0 +1,65 @@
+// Per-position softmax loss. Work-item `index` maps to sample n and spatial
+// position s. It writes
+//   loss[index]   = -log(max(prob of the labelled class, FLT_MIN))
+//   counts[index] = 1
+// or zero to both when the label equals ignore_label_ and has_ignore_label_
+// is set. Clamping to FLT_MIN keeps log() finite for a zero probability.
+// Work-items with index >= nthreads do nothing.
+template <class T>
+__kernel void SoftmaxLossForwardGPU(const int nthreads,
+    __global T* prob_data, __global T* label,__global T* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global T* counts) {
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      loss[index] = 0;
+      counts[index] = 0;
+    } else {
+      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+                      T(FLT_MIN)));
+      counts[index] = 1;
+    }
+  }
+}
+
+template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+    __global float* prob_data, __global float* label,__global float* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global float* counts);
+template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+    __global double* prob_data, __global double* label,__global double* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global double* counts);
+
+// Softmax loss gradient, applied in place to bottom_diff. For an ignored label
+// every channel of that position is zeroed and counts[index] = 0; otherwise 1
+// is subtracted from the labelled channel and counts[index] = 1.
+// NOTE(review): bottom_diff presumably already holds the softmax
+// probabilities when this runs; confirm against the host code. `top` is not
+// read by this kernel.
+//
+// `counts` carries the __global qualifier here and in both instantiations:
+// pointer arguments of a kernel must name an address space, and the forward
+// kernel already declares it that way.
+template <class T>
+__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
+    __global T* label,__global T* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, __global T* counts) {
+  const int channels = dim / spatial_dim;
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      for (int c = 0; c < channels; ++c) {
+        bottom_diff[n * dim + c * spatial_dim + s] = 0;
+      }
+      counts[index] = 0;
+    } else {
+      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+      counts[index] = 1;
+    }
+  }
+}
+template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
+    __global float* label,__global float* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, __global float* counts);
+
+template __attribute__ ((mangled_name(softmax_loss_bp_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+    __global double* label,__global double* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, __global double* counts);
From 858b0828b95273b99f598d4ac1379459b172a648 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 8 Aug 2015 08:12:32 +0800 Subject: [PATCH 030/124] Created global kernel map --- include/caffe/device.hpp | 7 +- include/caffe/neuron_layers.hpp | 18 ----- include/caffe/util/ocl_wrapper.hpp | 4 +- src/caffe/device.cpp | 122 +++++++++++++++-------------- src/caffe/layers/relu_layer.cpp | 20 +---- src/caffe/ocl/relu_layer.cl | 9 +-- src/caffe/util/ocl_wrapper.cpp | 21 +++-- 7 files changed, 92 insertions(+), 109 deletions(-) diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 0b534e57..cea343e8 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -23,7 +23,7 @@ class Device{ cl_device_id * pDevices; clblasOrder col; clblasOrder row; - + std::map Kernels; cl_int Init(); cl_int ConvertToString(std::string pFileName,std::string &Str); @@ -32,13 +32,14 @@ class Device{ void GetDeviceInfo(); - cl_program BuildProgram(std::string); + void BuildProgram(std::string kernel_dir); template void
DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str); template void appendBitfield(T info, T value, std::string name, std::string &str); - + + cl_kernel GetKernel(std::string kernel_name); }; extern char* buildOption; diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index bcb834de..9fe415f1 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -433,9 +433,7 @@ class ReLULayer : public NeuronLayer { */ explicit ReLULayer(const LayerParameter& param) : NeuronLayer(param) { - ocl_setup(); } - ~ReLULayer(); virtual inline const char* type() const { return "ReLU"; } protected: @@ -487,22 +485,6 @@ class ReLULayer : public NeuronLayer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - -//OpenCL related setup - void ocl_setup(); -//OpenCL wrapper - void ReLUForward_gpu(int count, const Dtype *bottom_data,Dtype *top_data, Dtype negative_slope) - { - ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope); - } - void ReLUBackward_gpu(int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff, Dtype negative_slope) - { - ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope); - } - protected: - cl_kernel ReLUForward_kernel; - cl_kernel ReLUBackward_kernel; - }; #ifdef USE_CUDNN diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 35ad695e..8d5a6a50 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -68,10 +68,10 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); template -void 
ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); template -void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); +void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); template void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 3beba234..3f3fcf27 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -4,6 +4,8 @@ #include #include #include +#include + namespace caffe { //delete it after test, Yibing cl_mem test_alloc_mem[10]; @@ -11,6 +13,7 @@ extern long long unsigned device_mem_consumption; Device amdDevice; char* buildOption = "-x clc++ "; +std::string oclKernelPath="./src/caffe/ocl/"; Device::~Device(){ //clAmdBlasTeardown(); @@ -41,15 +44,6 @@ cl_int Device::Init(){ } platformName[nameLen] = 0; - //Get OpenCL Information - //res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_VERSION, 64, openclVersion, &nameLen); - //if(res != CL_SUCCESS) { - // fprintf(stderr, "Err: Get OpenCL Info failed!\n", res); - // return 0; - //} - //openclVersion[nameLen] = 0; - //printf("%s %s\n", platformName, openclVersion); - GetDeviceInfo(); cl_uint uiNumDevices; cl_bool unified_memory = false; @@ -124,57 +118,13 @@ cl_int Device::Init(){ fprintf(stderr,"Err: Failed to Create Commandqueue\n"); return 0; } + + + //BuildProgram from OpenCL kernel files + BuildProgram(oclKernelPath); - std::string strSource = ""; - - std::string pFileName[8]; - pFileName[0] = "./src/caffe/ocl/OCL_kernel.cl"; - pFileName[1] = "./src/caffe/ocl/lrn_layer.cl"; - pFileName[2] = "./src/caffe/ocl/pooling_layer.cl"; - pFileName[3] = "./src/caffe/ocl/dropout_layer.cl"; - 
pFileName[4] = "./src/caffe/ocl/relu_layer.cl"; - pFileName[5] = "./src/caffe/ocl/softmax_layer.cl"; - pFileName[6] = "./src/caffe/ocl/softmaxwithloss_layer.cl"; - pFileName[7] = "./src/caffe/ocl/im2col.cl"; - - for(int fileNum = 0; fileNum < 8; fileNum++) { - std::string tmpSource = ""; - ConvertToString(pFileName[fileNum], tmpSource); - strSource += tmpSource; - } - - const char *pSource; - pSource = strSource.c_str(); - size_t uiArrSourceSize[] = {0}; - uiArrSourceSize[0] = strlen(pSource); - Program = NULL; - Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL); - if(NULL == Program){ - fprintf(stderr,"Err: Failed to create program\n"); - } - - //Build Program - cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL); - LOG(INFO) << "Build Program"; - if(CL_SUCCESS != iStatus){ - fprintf(stderr,"Err: Failed to build program\n"); - char szBuildLog[16384]; - clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL); - std::cout << szBuildLog; - clReleaseProgram(Program); - } - - /* - //Setup AmdBlas; - cl_int err; - err = clAmdBlasSetup(); - if(err != CL_SUCCESS){ - printf("clAmdBlasSetup() failed with %d\n", err); - } - */ row = clblasRowMajor; col = clblasColumnMajor; - /* //delete after test the large buffer allocation, Yibing long long global_mem_size_limit = 1024*1024; //4*1024*1024*1024; @@ -218,6 +168,50 @@ cl_int Device::Init(){ return 0; } +void Device::BuildProgram(std::string kernel_dir) +{ + //Access opencl kernel files + std::string strSource = ""; + DIR *ocl_dir; + struct dirent *dirp; + if((ocl_dir=opendir(kernel_dir.c_str())) == NULL) + { + printf("Open ocl dir failed!\n"); + } + while((dirp = readdir(ocl_dir)) != NULL) + { + //Ignore hidden files + if(dirp->d_name[0] == '.') + continue; + std::string ocl_kernel_full_path=kernel_dir+std::string(dirp->d_name); + std::string tmpSource = ""; + ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); + 
strSource += tmpSource; + } + + const char *pSource; + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = {0}; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL); + if(NULL == Program){ + fprintf(stderr,"Err: Failed to create program\n"); + } + + //Build Program + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL); + LOG(INFO) << "Build Program"; + if(CL_SUCCESS != iStatus){ + fprintf(stderr,"Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram(Program); + } + + // return Program; +} //Use to read OpenCL source code cl_int Device::ConvertToString(std::string pFileName,std::string &Str){ @@ -247,6 +241,7 @@ cl_int Device::ConvertToString(std::string pFileName,std::string &Str){ return -1; } +/* cl_program Device::BuildProgram(std::string pFileName) { //Read our own kernel file @@ -275,6 +270,19 @@ cl_program Device::BuildProgram(std::string pFileName) } return program; } +*/ +cl_kernel Device::GetKernel(std::string kernel_name) +{ + std::map::iterator it = Kernels.find(kernel_name); + if(it == Kernels.end()) + { + cl_int _err=0; + cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err); + OCL_CHECK(_err); + Kernels[kernel_name] = kernel; + } + return Kernels[kernel_name]; +} void Device::DisplayPlatformInfo(){ cl_int err; diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 6ee3237a..c38814f1 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,22 +5,6 @@ #include "caffe/vision_layers.hpp" namespace caffe { -template -void ReLULayer::ocl_setup(){ - cl_program program = amdDevice.BuildProgram("src/caffe/layers/relu_layer.cl"); - cl_int _err=0; - ReLUForward_kernel = 
clCreateKernel(program,"ReLUForwardfloat",&_err); - ReLUBackward_kernel = clCreateKernel(program,"ReLUBackwardfloat",&_err); -} - -template -ReLULayer::~ReLULayer(){ - OCL_CHECK( clReleaseKernel(ReLUForward_kernel) ); - OCL_CHECK( clReleaseKernel(ReLUBackward_kernel) ); -} - - - template void ReLULayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -68,7 +52,7 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; - ReLUForward_gpu(count,bottom_data,top_data,negative_slope); + ReLUForward(count,bottom_data,top_data,negative_slope); } @@ -86,7 +70,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, // ReLUBackward<<>>( // count, top_diff, bottom_data, bottom_diff, negative_slope); // CUDA_POST_KERNEL_CHECK; - ReLUBackward_gpu(count,top_diff,bottom_data,bottom_diff,negative_slope); + ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope); } } diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index 0d8d3b4e..c9ba4900 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -5,9 +5,8 @@ __kernel void ReLUForward(const int count, __global T* in, __global T* out, T ne out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; } -//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); +template __attribute__ ((mangled_name(ReLUForwardFloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForwardDouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); template __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ @@ -16,5 +15,5 @@ __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_ out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; } -template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); -template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); +template __attribute__ ((mangled_name(ReLUBackwardFloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); +template __attribute__ ((mangled_name(ReLUBackwardDouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* 
out_diff, double negative_slope); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 7b57d329..f5f7e945 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -633,7 +633,11 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const fl template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); template -void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ + Dtype type; + std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double"; + std::string kernel_name = std::string("ReLUForward")+str_type; + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); @@ -645,11 +649,16 @@ void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dt OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void ReLUForward(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope); -template void ReLUForward(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope); +template void ReLUForward(const int count, const float* bottom_data, float* top_data, float negative_slope); +template void ReLUForward(const int count, const double* bottom_data, double* top_data, double negative_slope); template -void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const 
Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ +void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ + Dtype type; + std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double"; + std::string kernel_name = std::string("ReLUBackward")+str_type; + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); @@ -662,8 +671,8 @@ void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, cons size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void ReLUBackward(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); -template void ReLUBackward(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); +template void ReLUBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); +template void ReLUBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); template void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { From 6934793436bcd0f6960d3a21e4830ea1ee5e09d5 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 9 Aug 2015 02:26:02 +0800 Subject: [PATCH 031/124] ocl wrappers get kernel from the map @amdDevice.Kernels instead of passing from outside; haven't finished in conv_layer --- include/caffe/common_layers.hpp | 6 - include/caffe/neuron_layers.hpp | 4 - include/caffe/util/ocl_util.hpp | 2 +- 
include/caffe/util/ocl_wrapper.hpp | 48 ++--- include/caffe/vision_layers.hpp | 14 +- src/caffe/common.cpp | 4 - src/caffe/device.cpp | 3 +- src/caffe/layers/base_conv_layer.cpp | 2 +- src/caffe/layers/base_data_layer.cpp | 26 +-- src/caffe/layers/dropout_layer.cpp | 19 +- src/caffe/layers/pooling_layer.cpp | 44 +---- src/caffe/layers/power_layer.cpp | 28 +-- src/caffe/layers/softmax_layer.cpp | 30 +--- src/caffe/layers/softmax_loss_layer.cpp | 14 +- src/caffe/ocl/OCL_kernel.cl | 8 +- src/caffe/ocl/dropout_layer.cl | 8 +- src/caffe/ocl/pooling_layer.cl | 24 +-- src/caffe/ocl/relu_layer.cl | 8 +- src/caffe/ocl/softmaxwithloss_layer.cl | 8 +- src/caffe/solver.cpp | 8 +- src/caffe/util/ocl_util.cpp | 10 +- src/caffe/util/ocl_wrapper.cpp | 221 ++++++++++++++++-------- 22 files changed, 247 insertions(+), 292 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index a92bb4aa..eb77e762 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -483,7 +483,6 @@ class SoftmaxLayer : public Layer { public: explicit SoftmaxLayer(const LayerParameter& param) : Layer(param) { - ocl_setup(); } ~SoftmaxLayer(); virtual void Reshape(const vector*>& bottom, @@ -502,7 +501,6 @@ class SoftmaxLayer : public Layer { const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void ocl_setup(); int outer_num_; int inner_num_; @@ -511,10 +509,6 @@ class SoftmaxLayer : public Layer { Blob sum_multiplier_; /// scale is an intermediate Blob to hold temporary results. 
Blob scale_; - protected: - cl_kernel channel_max_kernel,channel_subtract_kernel,exp_kernel, channel_sum_kernel; - cl_kernel channel_div_kernel,channel_dot_kernel; - }; #ifdef USE_CUDNN diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 9fe415f1..cf6d645a 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -412,10 +412,6 @@ class PowerLayer : public NeuronLayer { Dtype shift_; /// @brief Result of @f$ \alpha \gamma @f$ Dtype diff_scale_; - - protected: - void ocl_setup(); - cl_kernel memset_kernel, scalar_kernel, div_kernel, mul_kernel, powx_kernel; }; /** diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 55695070..25747702 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -6,7 +6,7 @@ namespace caffe { template -void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count); +void ocl_memset(Dtype* buffer, const Dtype value, const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 8d5a6a50..71e13b2e 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -6,8 +6,8 @@ namespace caffe { typedef unsigned int uint32_t; -template -void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); +//template +//void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); template void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); @@ -38,28 +38,28 @@ template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int 
kernel_size_, const int stride_, Dtype* top_data); template -void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask); +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask); template -void MaxPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); template -void AvePoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); +void AvePoolBackward(const int nthreads, const Dtype* 
const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); template - void StoPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff); + void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data); template -void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data); +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data); template -void StoPoolForwardTrain(cl_kernel Kernel,const int 
count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data); +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data); template -void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data); +void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ); @@ -74,53 +74,53 @@ template void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); template -void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); +void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype 
scale_, Dtype *top_data); +void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data); template -void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff); +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff); template -void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); template void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ); template -void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_max(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* out); template -void kernel_channel_subtract(cl_kernel Kernel, const int count, +void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_max, Dtype* data); template -void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out); +void kernel_exp(const int count, const Dtype* data, Dtype* out); template -void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* channel_sum); template -void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data); +void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data); template -void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_dot(const int 
num, const int channels, const int spatial_dim, const Dtype* data_1, const Dtype* data_2, Dtype* channel_dot); template -void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, const Dtype* label, Dtype* loss, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts); template -void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top, +void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, const Dtype* label, Dtype* bottom_diff, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts); @@ -129,7 +129,7 @@ template void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y); template -void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data); +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); template void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in, @@ -149,10 +149,10 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, const int width, const int size, const Dtype negative_beta, const Dtype cache_ratio, Dtype* const bottom_diff); template -void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y); +void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y); template -void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y); +void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y); } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 336127d5..a1c9577d 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -121,7 +121,7 @@ class BaseConvolutionLayer : 
public Layer { } protected: inline void gpu_memset(Dtype* data, Dtype value, int count) { - ocl_memset(oclmem_kernel, data, value, count); + ocl_memset(data, value, count); } #endif @@ -445,12 +445,10 @@ class PoolingLayer : public Layer { public: explicit PoolingLayer(const LayerParameter& param) : Layer(param) {} - ~PoolingLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); - void ocl_setup(); virtual inline const char* type() const { return "Pooling"; } virtual inline int ExactNumBottomBlobs() const { return 1; } @@ -482,16 +480,6 @@ class PoolingLayer : public Layer { Blob rand_idx_; Blob max_idx_; -//opencl related data structures -protected: - cl_kernel MaxPoolForward_kernel, - AvePoolForward_kernel, - StoPoolForwardTrain_kernel, - StoPoolForwardTest_kernel, - MaxPoolBackward_kernel, - AvePoolBackward_kernel, - StoPoolBackward_kernel; - }; #ifdef USE_CUDNN diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 407668c9..c4fe1195 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -112,10 +112,6 @@ Caffe::Caffe() if(err != CL_SUCCESS){ LOG(ERROR) << "clBLAS setup failed "<::backward_gpu_opt(const vector*>& t // Bias gradient, if necessary. 
if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count()); + ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < num_; ++n) { caffe_gpu_gemv(CblasNoTrans, M_, N_, (Dtype)1., top_diff, top[i]->offset(n), N_, diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 1b6e07fa..71f5c132 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -60,13 +60,8 @@ template void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { // First, join the thread - CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer; - join_prefetch_timer.Start(); JoinPrefetchThread(); - join_prefetch_timer.Stop(); - printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds()); - forward_timer.Start(); DLOG(INFO) << "Thread joined"; // Reshape to loaded data. 
top[0]->ReshapeLike(prefetch_data_); @@ -81,33 +76,24 @@ void BasePrefetchingDataLayer::Forward_cpu( caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), top[1]->mutable_cpu_data()); } - forward_timer.Stop(); - printf("write buffer time: %f\n", forward_timer.MilliSeconds()); // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; - create_prefetch_timer.Start(); CreatePrefetchThread(); - create_prefetch_timer.Stop(); - printf("create prefetch time: %f\n", create_prefetch_timer.MilliSeconds() ); } template void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer; - join_prefetch_timer.Start(); JoinPrefetchThread(); - join_prefetch_timer.Stop(); - printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds()); + DLOG(INFO) << "Thread joined"; // Copy the data from prefetch thread to data_layer //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); - clFinish(amdDevice.CommandQueue); - forward_timer.Start(); top[0]->ReshapeLike(this->prefetch_data_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); - if (this->output_labels_) { + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { // Reshape to loaded labels. 
top[1]->ReshapeLike(prefetch_label_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); @@ -115,10 +101,7 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo } // clFinish(amdDevice.CommandQueue); - forward_timer.Stop(); - printf("Write buffer time: %f\n\n", forward_timer.MilliSeconds()); - #ifdef Track_data_transfer #endif @@ -126,10 +109,7 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; - create_prefetch_timer.Start(); CreatePrefetchThread(); - create_prefetch_timer.Stop(); - printf("create_prefetch time: %f\n", create_prefetch_timer.MilliSeconds()); //return Dtype(0.); } diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 996098bc..dfd6560d 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -12,19 +12,12 @@ namespace caffe { template void DropoutLayer::ocl_setup(int bottom_count){ - //create OpenCL related cl_mem objects and kernels - ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat", NULL); - ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat", NULL); - rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat", NULL); MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL); } template DropoutLayer::~DropoutLayer(){ OCL_CHECK( clReleaseMemObject(MaskMem) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_Fwd) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_Bwd) ); - OCL_CHECK( clReleaseKernel(rng_kernel) ); } @@ -100,11 +93,11 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, static_cast(rand_vec_.mutable_cpu_data()); caffe_rng_bernoulli(count, 1. 
- threshold_, mask_cpu); OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); - DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else // caffe_gpu_rng_uniform(count, mask); - caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); - DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); + DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #endif // set thresholds // NOLINT_NEXT_LINE(whitespace/operators) @@ -112,9 +105,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, // count, bottom_data, mask, uint_thres_, scale_, top_data); // CUDA_POST_KERNEL_CHECK; } else { - //caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data); - if(bottom_data != top_data) - OCL_CHECK( clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)bottom_data, (cl_mem)top_data, 0, 0, count*sizeof(Dtype), 0, NULL, NULL) ); + caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data); } } @@ -135,7 +126,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, // CAFFE_CUDA_NUM_THREADS>>>( // count, top_diff, mask, uint_thres_, scale_, bottom_diff); // CUDA_POST_KERNEL_CHECK; - DropoutBackward(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); + DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); } else { caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); } diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 3c94f0de..83b18c89 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -13,17 +13,6 @@ namespace caffe { using std::min; using std::max; -template 
-PoolingLayer::~PoolingLayer(){ - OCL_CHECK( clReleaseKernel(MaxPoolForward_kernel) ); - OCL_CHECK( clReleaseKernel(AvePoolForward_kernel) ); - OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) ); - OCL_CHECK( clReleaseKernel(StoPoolForwardTest_kernel) ); - OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) ); - OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) ); - OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) ); -} - template void PoolingLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { @@ -87,20 +76,8 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, CHECK_LT(pad_h_, kernel_h_); CHECK_LT(pad_w_, kernel_w_); } - //Intialize OpenCL related - ocl_setup(); } -template - void PoolingLayer::ocl_setup(){ - MaxPoolForward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolForwardfloat", NULL); - AvePoolForward_kernel = clCreateKernel(amdDevice.Program, "AvePoolForwardfloat", NULL); - StoPoolForwardTrain_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTrainfloat", NULL); - StoPoolForwardTest_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTestfloat", NULL); - MaxPoolBackward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolBackwardfloat", NULL); - AvePoolBackward_kernel = clCreateKernel(amdDevice.Program, "AvePoolBackwardfloat", NULL); - StoPoolBackward_kernel = clCreateKernel(amdDevice.Program, "StoPoolBackwardfloat", NULL); -} template void PoolingLayer::Reshape(const vector*>& bottom, @@ -352,8 +329,7 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, mask = max_idx_.mutable_gpu_data(); } // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward(MaxPoolForward_kernel, - count, bottom_data, bottom[0]->num(), channels_, + MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, mask, top_mask); @@ -367,8 +343,7 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, break; case 
PoolingParameter_PoolMethod_AVE: // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward(AvePoolForward_kernel, - count, bottom_data, bottom[0]->num(), channels_, + AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); /* @@ -384,15 +359,13 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), rand_idx_.mutable_gpu_data()); // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain(StoPoolForwardTrain_kernel, - count, bottom_data, bottom[0]->num(), channels_, + StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, rand_idx_.mutable_gpu_data(), top_data); } else { // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest(StoPoolForwardTest_kernel, - count, bottom_data, bottom[0]->num(), channels_, + StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, top_data); } @@ -425,23 +398,20 @@ void PoolingLayer::Backward_gpu(const vector*>& top, mask = max_idx_.gpu_data(); } // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward(MaxPoolBackward_kernel, - count, top_diff, mask, top_mask, top[0]->num(), channels_, + MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); break; case PoolingParameter_PoolMethod_AVE: // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward(AvePoolBackward_kernel, - count, top_diff, top[0]->num(), channels_, + AvePoolBackward(count, top_diff, top[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); break; 
case PoolingParameter_PoolMethod_STOCHASTIC: // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward(StoPoolBackward_kernel, - count, rand_idx_.gpu_data(), top_diff, + StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, top[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, bottom_diff); diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 94393f73..d3c374f1 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -18,18 +18,8 @@ void PowerLayer::LayerSetUp(const vector*>& bottom, scale_ = this->layer_param_.power_param().scale(); shift_ = this->layer_param_.power_param().shift(); diff_scale_ = power_ * scale_; - //OpenCL related set up - ocl_setup(); } -template -void PowerLayer::ocl_setup(){ - memset_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); - scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); - div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); - powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); - mul_kernel = clCreateKernel(amdDevice.Program, "element_mul_float", NULL); -} // Compute y = (shift + scale * x)^power template @@ -116,7 +106,7 @@ void PowerLayer::Forward_gpu(const vector*>& bottom, // Special case where we can ignore the input: scale or power is 0. if (diff_scale_ == Dtype(0)) { Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); - ocl_memset(memset_kernel, top_data, value, count); + ocl_memset(top_data, value, count); return; } const Dtype* bottom_data = bottom[0]->gpu_data(); @@ -125,10 +115,10 @@ void PowerLayer::Forward_gpu(const vector*>& bottom, caffe_gpu_scal(count, scale_, top_data); } if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(scalar_kernel, count, shift_, top_data); + caffe_gpu_add_scalar(count, shift_, top_data); } if (power_ != Dtype(1)) { - caffe_gpu_powx(powx_kernel, count, top_data, power_, top_data); + caffe_gpu_powx(count, top_data, power_, top_data); } } @@ -140,7 +130,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, const int count = bottom[0]->count(); const Dtype* top_diff = top[0]->gpu_diff(); if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - ocl_memset(memset_kernel, bottom_diff, diff_scale_,count); + ocl_memset(bottom_diff, diff_scale_,count); } else { const Dtype* bottom_data = bottom[0]->gpu_data(); // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) @@ -152,7 +142,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(scalar_kernel, count, diff_scale_ * shift_, bottom_diff); + caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); } } else if (shift_ == Dtype(0)) { // Special case for y = (scale * x)^power @@ -160,7 +150,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, // = scale * power * (scale * x)^power * (scale * x)^(-1) // = power * y / x const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(div_kernel, count, top_data, bottom_data, bottom_diff); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); caffe_gpu_scal(count, power_, bottom_diff); } else { caffe_gpu_copy(count, bottom_data, bottom_diff); @@ -168,16 +158,16 @@ void PowerLayer::Backward_gpu(const vector*>& top, caffe_gpu_scal(count, scale_, bottom_diff); } if (shift_ != 
Dtype(0)) { - caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff); + caffe_gpu_add_scalar(count, shift_, bottom_diff); } const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); if (diff_scale_ != Dtype(1)) { caffe_gpu_scal(count, diff_scale_, bottom_diff); } } } - caffe_gpu_mul(mul_kernel, count, top_diff, bottom_diff, bottom_diff); + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } } diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 07c2fcfc..27c18b7b 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -24,25 +24,9 @@ void SoftmaxLayer::Reshape(const vector*>& bottom, scale_.Reshape(scale_dims); } -template -void SoftmaxLayer::ocl_setup(){ - cl_int err = 0; - channel_max_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_max_float", &err); - channel_subtract_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_subtract_float", &err);; - exp_kernel = clCreateKernel(amdDevice.Program, "kernel_exp_float", &err);; - channel_sum_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_sum_float", &err);; - channel_div_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_div_float", &err);; - channel_dot_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_dot_float", &err);; -} template SoftmaxLayer::~SoftmaxLayer(){ - clReleaseKernel(channel_max_kernel); - clReleaseKernel(channel_subtract_kernel); - clReleaseKernel(exp_kernel); - clReleaseKernel(channel_sum_kernel); - clReleaseKernel(channel_div_kernel); - clReleaseKernel(channel_dot_kernel); } template @@ -122,22 +106,22 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, // compute max // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max(channel_max_kernel, outer_num_, channels, inner_num_, top_data, + kernel_channel_max(outer_num_, channels, 
inner_num_, top_data, scale_data); // subtract // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract(channel_subtract_kernel, count, outer_num_, channels, inner_num_, + kernel_channel_subtract(count, outer_num_, channels, inner_num_, scale_data, top_data); // exponentiate // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp(exp_kernel, count, top_data, top_data); + kernel_exp(count, top_data, top_data); // sum after exp // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum(channel_sum_kernel, outer_num_, channels, inner_num_, top_data, + kernel_channel_sum(outer_num_, channels, inner_num_, top_data, scale_data); // divide // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div(channel_div_kernel, count, outer_num_, channels, inner_num_, + kernel_channel_div(count, outer_num_, channels, inner_num_, scale_data, top_data); } @@ -154,10 +138,10 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
// NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot(channel_dot_kernel, outer_num_, channels, inner_num_, + kernel_channel_dot(outer_num_, channels, inner_num_, top_diff, top_data, scale_data); // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract(channel_subtract_kernel, count, outer_num_, channels, inner_num_, + kernel_channel_subtract(count, outer_num_, channels, inner_num_, scale_data, bottom_diff); // elementwise multiplication caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 4b091d3a..a3cca01c 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -34,22 +34,12 @@ void SoftmaxWithLossLayer::LayerSetUp( template void SoftmaxWithLossLayer::ocl_setup(){ - cl_int err=0; - scal_kernel = clCreateKernel(amdDevice.Program, "scal_float", &err); - diff_kernel = clCreateKernel(amdDevice.Program, "diff_float", &err); - softmax_kernel = clCreateKernel(amdDevice.Program, "softmax_float", &err); d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL); - softmax_loss_fp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_fp_float", &err); - softmax_loss_bp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_bp_float", &err); } template SoftmaxWithLossLayer::~SoftmaxWithLossLayer(){ - clReleaseKernel(diff_kernel); - clReleaseKernel(scal_kernel); - clReleaseKernel(softmax_loss_fp_kernel); - clReleaseKernel(softmax_loss_bp_kernel); } template @@ -158,7 +148,7 @@ void SoftmaxWithLossLayer::Forward_gpu( // to avoid having to allocate additional GPU memory. 
Dtype* counts = prob_.mutable_gpu_diff(); // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU(softmax_loss_fp_kernel, nthreads, prob_data, label, loss_data, + SoftmaxLossForwardGPU( nthreads, prob_data, label, loss_data, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); Dtype loss; caffe_gpu_asum(nthreads, loss_data, &loss); @@ -195,7 +185,7 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, // we use to to avoid allocating new GPU memory. Dtype* counts = prob_.mutable_gpu_diff(); // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU(softmax_loss_bp_kernel, nthreads, top_data, label, bottom_diff, + SoftmaxLossBackwardGPU(nthreads, top_data, label, bottom_diff, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); const Dtype loss_weight = top[0]->cpu_diff()[0]; if (normalize_) { diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl index 7014721b..bc5eabff 100644 --- a/src/caffe/ocl/OCL_kernel.cl +++ b/src/caffe/ocl/OCL_kernel.cl @@ -718,9 +718,9 @@ __kernel void PRNG_threefry4x32( } -template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); +template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); -template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); +template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); //end of the 
looooooong gpu_random_generator kernel @@ -733,8 +733,8 @@ __kernel void OCL_memset(__global T* buffer, const T value, const int size){ } } -template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); -template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); +template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); +template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); __kernel void OCL_memset2(__global int* buffer, const int value, const int size){ int gdx = get_global_id(0); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl index 3b1c479b..8d3db447 100644 --- a/src/caffe/ocl/dropout_layer.cl +++ b/src/caffe/ocl/dropout_layer.cl @@ -4,8 +4,8 @@ __kernel void DropoutForward(const int n, __global T *in, __global const int* ma if (index < n) out[index] = in[index] * scale * mask[index]; } -template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); -template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); +template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); template @@ -14,5 +14,5 @@ __kernel void DropoutBackward(const int n, 
__global T *in_diff, __global const i if (index < n) out_diff[index] = in_diff[index] * scale * mask[index]; } -template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); -template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); +template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 5ac4bd52..80289b68 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -33,8 +33,8 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const } } } -template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); -template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const 
int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); +template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); +template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); template __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){ @@ -103,8 +103,8 @@ __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, c } } } -template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); -template 
__attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); +template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); template __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ @@ -132,8 +132,8 @@ __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const } top_data[index] = cumvalues / cumsum; } } -template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int 
stride_w,__global float* top_data); -template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); +template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); template __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, @@ -182,8 +182,8 @@ __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, bottom_diff[index] = gradient; } } -template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); -template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const 
mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); +template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){ @@ -215,8 +215,8 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in } } -template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int 
stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); -template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); +template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template void StoPoolBackward(const int nthreads, @@ -253,13 +253,13 @@ void StoPoolBackward(const int nthreads, } } -template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel void StoPoolBackward(const int nthreads, +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, __global float* rand_idx, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* bottom_diff); -template __attribute__ ((mangled_name(StoPoolBackwarddouble))) 
__kernel void StoPoolBackward(const int nthreads, +template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, __global double* rand_idx, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index c9ba4900..df26d66e 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -5,8 +5,8 @@ __kernel void ReLUForward(const int count, __global T* in, __global T* out, T ne out[index] = in[index] > 0? in[index]:in[index]*negative_slope; } -template __attribute__ ((mangled_name(ReLUForwardFloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwardDouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); +template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); template __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ @@ -15,5 +15,5 @@ __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_ out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; } -template __attribute__ ((mangled_name(ReLUBackwardFloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); -template __attribute__ ((mangled_name(ReLUBackwardDouble))) __kernel void ReLUBackward(const int count, __global 
double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); +template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); +template __attribute__ ((mangled_name(ReLUBackward_double))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index 6d6e4f0b..97eb6874 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -20,12 +20,12 @@ __kernel void SoftmaxLossForwardGPU(const int nthreads, } } -template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, __global float* prob_data, __global float* label,__global float* loss, int num, int dim, int spatial_dim, bool has_ignore_label_, int ignore_label_, __global float* counts); -template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, __global double* prob_data, __global double* label,__global double* loss, int num, int dim, int spatial_dim, bool has_ignore_label_, int ignore_label_, @@ -54,12 +54,12 @@ __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, } } } -template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, __global float* 
label,__global float* bottom_diff, int num, int dim, int spatial_dim, bool has_ignore_label_, int ignore_label_, float* counts); -template __attribute__ ((mangled_name(softmax_loss_bp_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, +template __attribute__ ((mangled_name(SoftmaxLossBackward_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, __global double* label,__global double* bottom_diff, int num, int dim, int spatial_dim, bool has_ignore_label_, int ignore_label_, double* counts); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 715297a6..63c8294c 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -766,7 +766,7 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { case Caffe::GPU: { #ifndef CPU_ONLY // compute square of gradient in update - caffe_gpu_powx(powx_kernel, net_params[param_id]->count(), + caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); @@ -777,14 +777,14 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->history_[param_id]->mutable_gpu_data()); // prepare update - caffe_gpu_powx(powx_kernel, net_params[param_id]->count(), + caffe_gpu_powx( net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); - caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(), + caffe_gpu_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); - caffe_gpu_div(div_kernel, net_params[param_id]->count(), + caffe_gpu_div(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index eef9f544..044f9e69 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ 
-8,10 +8,12 @@ #include "caffe/common.hpp" #include "caffe/util/ocl_util.hpp" namespace caffe { - +template extern std::string get_dtype_suffix(); template -void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){ +void ocl_memset(Dtype* buffer, const Dtype value, const int count){ + std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int err=0; err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value); @@ -25,8 +27,8 @@ void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int co } // Explicit instantiation -template void ocl_memset(cl_kernel Kernel, float* buffer, const float value, const int count); -template void ocl_memset(cl_kernel Kernel, double* buffer, const double value, const int count); +template void ocl_memset(float* buffer, const float value, const int count); +template void ocl_memset(double* buffer, const double value, const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){ diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index f5f7e945..a9563c14 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -11,8 +11,27 @@ namespace caffe { typedef unsigned int uint32_t; struct array4x32 { uint32_t v[4]; }; + +template std::string get_dtype_suffix() +{ + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch(type){ + case 'i': suffix = "_int"; break; + case 'd': suffix = "_double"; break; + case 'f': + default: suffix = "_float"; + } + return suffix; +} + template -void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){ +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold) +{ + std::string kernel_name = std::string("RNGBernoulli") + 
get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + static unsigned c = 0; unsigned nrounds = 20; array4x32 rndctr4; @@ -33,8 +52,8 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype size_t localws[1] = {256}; OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); } -template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold); -template void caffe_gpu_bernoulli(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold); +template void caffe_gpu_bernoulli(int* a, const unsigned int n, float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(int* a, const unsigned int n, double inf, double sup, double threshold); template @@ -134,9 +153,12 @@ template float softmax_gpu(cl_kernel Kernel, const int num, const int dim template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); template -void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_max(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* out) { + std::string kernel_name = std::string("kernel_channel_max") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); @@ -148,15 +170,19 @@ void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, +template void 
kernel_channel_max( const int num, const int channels, const int spatial_dim, const float* data, float* out); -template void kernel_channel_max(cl_kernel Kernel, const int num, const int channels, +template void kernel_channel_max( const int num, const int channels, const int spatial_dim, const double* data, double* out); template -void kernel_channel_subtract(cl_kernel Kernel, const int count, +void kernel_channel_subtract( const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data){ + const int spatial_dim, const Dtype* channel_max, Dtype* data) +{ + std::string kernel_name = std::string("kernel_channel_subtract") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); @@ -169,16 +195,19 @@ void kernel_channel_subtract(cl_kernel Kernel, const int count, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_channel_subtract(cl_kernel Kernel, const int count, +template void kernel_channel_subtract( const int count, const int num, const int channels, const int spatial_dim, const float* channel_max, float* data); -template void kernel_channel_subtract(cl_kernel Kernel, const int count, +template void kernel_channel_subtract( const int count, const int num, const int channels, const int spatial_dim, const double* channel_max, double* data); template -void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out) +void kernel_exp(const int count, const Dtype* data, Dtype* out) { + std::string kernel_name = std::string("kernel_exp") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), 
(void*)&count) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); @@ -188,13 +217,16 @@ void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_exp(cl_kernel Kernel, const int count, const float* data, float* out); -template void kernel_exp(cl_kernel Kernel, const int count, const double* data, double* out); +template void kernel_exp(const int count, const float* data, float* out); +template void kernel_exp(const int count, const double* data, double* out); template -void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + std::string kernel_name = std::string("kernel_channel_sum") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); @@ -206,13 +238,16 @@ void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum); -template void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum); +template void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum); +template void 
kernel_channel_sum(const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum); template -void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, +void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data) { + std::string kernel_name = std::string("kernel_channel_div") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); @@ -225,16 +260,19 @@ void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, +template void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const float* channel_sum, float* data); -template void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, +template void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const double* channel_sum, double* data); template -void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, +void kernel_channel_dot(const int num, const int channels, const int spatial_dim, const Dtype* data_1, const Dtype* data_2, Dtype* channel_dot) { + std::string kernel_name = std::string("kernel_channel_dot") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); OCL_CHECK( 
clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); @@ -247,19 +285,22 @@ void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -template void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, +template void kernel_channel_dot(const int num, const int channels, const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot); -template void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels, +template void kernel_channel_dot(const int num, const int channels, const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot); template -void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, const Dtype* label, Dtype* loss, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts) { + std::string kernel_name = std::string("SoftmaxLossForwardGPU") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&prob_data)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); @@ -276,17 +317,20 @@ void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, const float* prob_data, const float* label, float* loss, +template void SoftmaxLossForwardGPU(const int nthreads, const float* prob_data, const float* label, float* loss, const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int 
ignore_label_,float* counts); -template void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads, const double* prob_data, const double* label, double* loss, +template void SoftmaxLossForwardGPU(const int nthreads, const double* prob_data, const double* label, double* loss, const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts); template -void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top, +void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, const Dtype* label, Dtype* bottom_diff, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts) { + std::string kernel_name = std::string("SoftmaxLossBackwardGPU") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); @@ -303,9 +347,9 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const float* top, const float* label, float* bottom_diff, +template void SoftmaxLossBackwardGPU(const int nthreads, const float* top, const float* label, float* bottom_diff, const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts); -template void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const double* top, const double* label, double* bottom_diff, +template void SoftmaxLossBackwardGPU(const int nthreads, const double* top, const double* label, double* bottom_diff, const int num, const int dim, const int spatial_dim, 
const bool has_ignore_label_, const int ignore_label_, double* counts); template @@ -364,8 +408,11 @@ template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const f template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data); template -void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){ - cl_int ret; +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){ + std::string kernel_name = std::string("MaxPoolForward") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); @@ -390,11 +437,14 @@ void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void MaxPoolForward(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int 
pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask); -template void MaxPoolForward(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask); +template void MaxPoolForward(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask); +template void MaxPoolForward(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask); template -void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data){ +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data) +{ + std::string kernel_name = 
std::string("StoPoolForwardTrain") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); @@ -416,11 +466,14 @@ void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_d size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void StoPoolForwardTrain(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data); -template void StoPoolForwardTrain(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data); +template void StoPoolForwardTrain(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data); +template void StoPoolForwardTrain(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data); template -void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int 
height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){ +void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){ + std::string kernel_name = std::string("StoPoolForwardTest") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); @@ -442,11 +495,13 @@ void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_da OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void StoPoolForwardTest(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data); -template void StoPoolForwardTest(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data); +template void StoPoolForwardTest(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data); +template void StoPoolForwardTest(const int count, 
const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data); template -void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){ +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){ + std::string kernel_name = std::string("AvePoolForward") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); @@ -469,8 +524,8 @@ void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void AvePoolForward(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data); -template void AvePoolForward(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int 
width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data); +template void AvePoolForward(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data); +template void AvePoolForward(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){ @@ -524,7 +579,9 @@ template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const fl template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff ); template -void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const 
bottom_diff){ +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){ + std::string kernel_name = std::string("MaxPoolBackward") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); @@ -550,12 +607,15 @@ void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const to OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void MaxPoolBackward(cl_kernel kernel, const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); -template void MaxPoolBackward(cl_kernel kernel, const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); +template void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int 
kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); +template void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); template -void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff) +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = std::string("AvePoolBackward") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); @@ -578,11 +638,13 @@ void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const to size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void AvePoolBackward(cl_kernel kernel, const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int 
kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); -template void AvePoolBackward(cl_kernel kernel, const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); +template void AvePoolBackward(const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); +template void AvePoolBackward(const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); template -void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){ +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){ + std::string kernel_name = std::string("StoPoolBackward") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; 
ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx); @@ -604,8 +666,8 @@ void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const ra size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void StoPoolBackward(cl_kernel kernel, const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff); -template void StoPoolBackward(cl_kernel kernel, const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff); +template void StoPoolBackward(const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff); +template void StoPoolBackward(const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, 
const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){ @@ -634,9 +696,7 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const d template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ - Dtype type; - std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double"; - std::string kernel_name = std::string("ReLUForward")+str_type; + std::string kernel_name = std::string("ReLUForward") + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); @@ -654,9 +714,7 @@ template void ReLUForward(const int count, const double* bottom_data, do template void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ - Dtype type; - std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double"; - std::string kernel_name = std::string("ReLUBackward")+str_type; + std::string kernel_name = std::string("ReLUBackward") + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -824,7 +882,9 @@ template void caffe_gpu_sign(cl_kernel Kernel,const int N, const float* template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double* X, double* Y ); template -void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ +void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ + std::string kernel_name = std::string("div") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); @@ -836,11 +896,13 @@ void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, 
NULL, NULL)); } -template void caffe_gpu_div (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); -template void caffe_gpu_div (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); +template void caffe_gpu_div (const int n, const float* a, const float* b, float* y); +template void caffe_gpu_div (const int n, const double* a, const double* b, double* y); template -void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){ +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ + std::string kernel_name = std::string("add_scalar") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha); @@ -851,11 +913,14 @@ void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtyp OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const float alpha, float* top_data); -template void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const double alpha, double* top_data); +template void caffe_gpu_add_scalar (const int n, const float alpha, float* top_data); +template void caffe_gpu_add_scalar (const int n, const double alpha, double* top_data); template -void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){ +void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ + std::string kernel_name = std::string("element_mul") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); @@ -867,11 +932,13 @@ void caffe_gpu_mul (cl_kernel Kernel, const int n, const 
Dtype* a, const Dtype* OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_mul (cl_kernel Kernel, const int n, const float* a, const float* b, float* y); -template void caffe_gpu_mul (cl_kernel Kernel, const int n, const double* a, const double* b, double* y); +template void caffe_gpu_mul (const int n, const float* a, const float* b, float* y); +template void caffe_gpu_mul (const int n, const double* a, const double* b, double* y); template -void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){ +void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ + std::string kernel_name = std::string("powx") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); @@ -883,12 +950,15 @@ void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_powx (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y); -template void caffe_gpu_powx (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y); +template void caffe_gpu_powx (const int n, const float* a, const float alpha, float* y); +template void caffe_gpu_powx (const int n, const double* a, const double alpha, double* y); template -void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) +void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) { + std::string kernel_name = std::string("DropoutForward") + get_dtype_suffix(); + cl_kernel kernel = 
amdDevice.GetKernel(kernel_name); + cl_int ret; ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); @@ -902,12 +972,15 @@ void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void DropoutForward(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); -template void DropoutForward(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); +template void DropoutForward(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); template -void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) { + std::string kernel_name = std::string("DropoutBackward") + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); @@ -921,7 +994,7 @@ void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, c size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void DropoutBackward(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* 
bottom_diff); -template void DropoutBackward(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); +template void DropoutBackward(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); } // namespace caffe From b904fcfbe9a638058263f0cd10538338f943ca02 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 9 Aug 2015 17:09:01 +0800 Subject: [PATCH 032/124] clean the ocl wrappers in conv_layer; check the type of files to be built in ./src/caffe/ocl --- include/caffe/util/im2col.hpp | 8 ++-- include/caffe/util/ocl_wrapper.hpp | 4 +- include/caffe/vision_layers.hpp | 16 +++---- src/caffe/device.cpp | 10 +++-- src/caffe/layers/base_conv_layer.cpp | 16 ++++--- src/caffe/ocl/OCL_kernel.cl | 8 ++-- src/caffe/ocl/im2col.cl | 20 ++++----- src/caffe/util/im2col.cpp | 45 ++++++++++++------- src/caffe/util/ocl_wrapper.cpp | 66 +++++++++++++++------------- 9 files changed, 108 insertions(+), 85 deletions(-) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 5eb28f9a..aec9e330 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -16,7 +16,7 @@ void col2im_cpu(const Dtype* data_col, const int channels, const int stride_w, Dtype* data_im); template -void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, @@ -24,7 +24,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, Dtype* data_im, const int img_offset); template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, 
const int img_offset, const int channels, +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, @@ -53,7 +53,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int stride, Dtype* data_col, const int col_offset); template -void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col, const int col_offset, int optnum); @@ -63,7 +63,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c const int stride, Dtype* data_im, const int img_offset); template -void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_im, const int img_offset, int optnum); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 71e13b2e..dbe2eb49 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -10,10 +10,10 @@ typedef unsigned int uint32_t; //void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); template -void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); template -void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, +void opttrans(const Dtype* 
data_im, const int im_offset, const int channels, const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum); template diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index a1c9577d..237e9cbf 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -98,26 +98,26 @@ class BaseConvolutionLayer : public Layer { } #ifndef CPU_ONLY inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, + im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); } inline void conv_im2col_gpu_opt(const Dtype* data) { - im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, + im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); } inline void conv_col2im_gpu_opt( Dtype* data) { - col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, + col2im_gpu_opt((Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); } inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { - transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); + transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); } inline 
void conv_transpose_gpu(const Dtype* data){ - opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); + opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); } protected: inline void gpu_memset(Dtype* data, Dtype value, int count) { @@ -142,10 +142,6 @@ class BaseConvolutionLayer : public Layer { const vector*>& top, bool skip_im2col = false) ; void backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - cl_kernel im2col_gpu_kernel, col2im_gpu_kernel; - cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel; - cl_kernel oclmem_kernel; - cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform; int opt_num2; int M_, N_, K_; int weight_offset_; diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index c10ddf25..23c3789b 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -182,9 +182,13 @@ void Device::BuildProgram(std::string kernel_dir) while((dirp = readdir(ocl_dir)) != NULL) { //Ignore hidden files - if(dirp->d_name[0] == '.') - continue; - std::string ocl_kernel_full_path=kernel_dir+std::string(dirp->d_name); + if(dirp->d_name[0] == '.') continue; + std::string file_name = std::string(dirp->d_name); + //Skip non *.cl files + size_t last_dot_pos = file_name.find_last_of("."); + if(file_name.substr(last_dot_pos+1) != "cl") continue; + + std::string ocl_kernel_full_path=kernel_dir+file_name; std::string tmpSource = ""; ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); strSource += tmpSource; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ea4a1658..1c1379b3 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -33,7 +33,7 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) template void BaseConvolutionLayer::ocl_setup() { - im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL); +/* 
im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL); col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL); oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); @@ -41,7 +41,7 @@ void BaseConvolutionLayer::ocl_setup() { opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL); ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL); ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL); - +*/ M_ = conv_out_channels_ / group_; K_ = kernel_dim_ / group_; N_ = conv_out_spatial_dim_; @@ -56,6 +56,7 @@ void BaseConvolutionLayer::ocl_setup() { template BaseConvolutionLayer::~BaseConvolutionLayer(){ + /* OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) ); OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) ); OCL_CHECK( clReleaseKernel(oclmem_kernel) ); @@ -63,6 +64,7 @@ template OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); OCL_CHECK( clReleaseKernel(im2col_opt_kernel) ); OCL_CHECK( clReleaseKernel(col2im_opt_kernel) ); +*/ } @@ -495,7 +497,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo col_offset = K_ * N_ * opt_num2; //step1: packed im2col, col_size = (K_ * group_ ) * N_ //this should be opt_num2 images packing together. 
- im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); //step 2: sgemm: Top (subTopMem) = weight * col_data @@ -520,7 +522,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo } #endif //step 3: tranform - transform_gpu(ocl_Kernel_transform, (Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); + transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); //step 4: add bias /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/ @@ -578,13 +580,13 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t col_offset = K_ * (N_ * opt_num2); //step1: packed im2col, col_size = (K_ * group_ ) * N_ //this should be opt_num2 images packing together. - im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_, + im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. 
without re-organize int height_top = M_ * group_, width_top = N_; //if (opt_num2 >1) - opttrans(opttrans_kernel, top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); + opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); //step 3: sgemm: Top (subTopMem) = weight * col_data for(g = 0; g < group_; ++g) { @@ -624,7 +626,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t #endif //step5: col2im - col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, + col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); #ifdef Track_layer LOG(WARNING) << "conv bp done"; diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl index bc5eabff..a014a5cf 100644 --- a/src/caffe/ocl/OCL_kernel.cl +++ b/src/caffe/ocl/OCL_kernel.cl @@ -981,8 +981,8 @@ __kernel void transpose(__global const T *src, __global T* dst, int width, int h if( gidx < width && gidyy < height * optnum ) dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; } -template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); -template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); +template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); template __kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int 
height, int optnum){ @@ -995,5 +995,5 @@ __kernel void transform(__global const T *src, __global T* dst, int top_offset, for(i = 0 ; i < width; i++) dst[(index * height + offset)* width + i] = src[gidx * width + i]; } -template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); -template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 577dd58f..728f8dd3 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -63,8 +63,8 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c } } -template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); -template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
+template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); template @@ -102,11 +102,11 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in } } -template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel(const int n, __global const float* data_im, +template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2col_gpu_double_kernel))) void im2col_gpu_kernel(const int n, __global const double* data_im, +template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset); @@ -146,12 +146,12 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i } 
} -template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, +template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w,const int pad_h, const int pad_w, const int stride_h, const int stride_w,const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, +template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); @@ -245,8 +245,8 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset data_im[index] = val; } } -template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); -template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); template @@ -294,5 +294,5 @@ __kernel void opttrans(const int n, __global T* data_im, const int im_offset, co data_opt[opt_index] = data_im[index]; } } -template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); -template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 
4d28ab1e..a5eb4176 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -8,6 +8,8 @@ namespace caffe { +template extern std::string get_dtype_suffix(); + template void im2col_cpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -83,9 +85,11 @@ template void col2im_cpu(const double* data_col, const int channels, template -void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_im, const int img_offset, int optnum){ + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; int num_kernels = channels * height * width; @@ -112,21 +116,24 @@ void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offse OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void col2im_gpu_opt(cl_kernel kernel, const float* data_col, const int col_offset, const int channels, +template void col2im_gpu_opt(const float* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, float* data_im, const int img_offset, int optnum); -template void col2im_gpu_opt(cl_kernel kernel, const double* data_col, const int col_offset, const int channels, +template void col2im_gpu_opt(const double* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, double* data_im, const int img_offset, int optnum); //cannot use now, need to modify kernel. 
template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset) { + std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int num_kernels = channels * height_col * width_col; @@ -156,24 +163,27 @@ void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, c } -template void im2col_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, +template void im2col_gpu(const float* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col, const int col_offset); -template void im2col_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, +template void im2col_gpu(const double* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col, const int col_offset); //cannot use now, need to modify kernel template -void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset) { + std::string kernel_name = 
"col2im_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int num_kernels = channels * height * width; @@ -201,11 +211,11 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void col2im_gpu(cl_kernel Kernel, const float* data_col, const int col_offset, +template void col2im_gpu(const float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_im, const int img_offset); -template void col2im_gpu(cl_kernel Kernel, const double* data_col, const int col_offset, +template void col2im_gpu(const double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w,const int stride_h, const int stride_w, @@ -285,10 +295,13 @@ template void im2col_16_gpu(cl_kernel Kernel, const double* data_im, con const int stride, double* data_col, const int col_offset); template -void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col, const int col_offset, int optnum) { + std::string kernel_name = "im2col_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; int num_kernels = optnum * channels * height_col * width_col; @@ -315,17 
+328,19 @@ void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void im2col_gpu_opt(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, +template void im2col_gpu_opt(const float* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, float* data_col, const int col_offset, int optnum); -template void im2col_gpu_opt(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, +template void im2col_gpu_opt(const double* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, double* data_col, const int col_offset, int optnum); template -void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, +void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; @@ -356,10 +371,10 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c } -template void col2im_gpu(cl_kernel Kernel, const float* data_col, const int col_offset, const int channels, +template void col2im_gpu(const float* data_col, const int col_offset, const int channels, const int height, const int width, const int psize, const int pad, const int stride, float* data_im, const int img_offset); -template void col2im_gpu(cl_kernel Kernel, const double* data_col, const int col_offset, const int 
channels, +template void col2im_gpu(const double* data_col, const int col_offset, const int channels, const int height, const int width, const int psize, const int pad, const int stride, double* data_im, const int img_offset); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index a9563c14..ac1d9958 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -29,7 +29,7 @@ template std::string get_dtype_suffix() template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold) { - std::string kernel_name = std::string("RNGBernoulli") + get_dtype_suffix(); + std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); static unsigned c = 0; @@ -57,7 +57,10 @@ template void caffe_gpu_bernoulli(int* a, const unsigned int n, double i template -void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ + std::string kernel_name = "transform" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src); OCL_CHECK(ret); @@ -77,8 +80,8 @@ void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offse OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) ); } -template void transform_gpu(cl_kernel Kernel, float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num); -template void transform_gpu(cl_kernel Kernel, double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num); +template void transform_gpu(float* src, float* dst, const int top_offset, const int N_, const int M_, const int 
packing_num); +template void transform_gpu(double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num); template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){ @@ -156,7 +159,7 @@ template void kernel_channel_max(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* out) { - std::string kernel_name = std::string("kernel_channel_max") + get_dtype_suffix(); + std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); @@ -180,7 +183,7 @@ void kernel_channel_subtract( const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_max, Dtype* data) { - std::string kernel_name = std::string("kernel_channel_subtract") + get_dtype_suffix(); + std::string kernel_name = "kernel_channel_subtract" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); @@ -205,7 +208,7 @@ template void kernel_channel_subtract( const int count, template void kernel_exp(const int count, const Dtype* data, Dtype* out) { - std::string kernel_name = std::string("kernel_exp") + get_dtype_suffix(); + std::string kernel_name = "kernel_exp" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); @@ -224,7 +227,7 @@ template void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - std::string kernel_name = std::string("kernel_channel_sum") + get_dtype_suffix(); + std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); @@ -245,7 +248,7 
@@ template void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - std::string kernel_name = std::string("kernel_channel_div") + get_dtype_suffix(); + std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); @@ -270,7 +273,7 @@ void kernel_channel_dot(const int num, const int channels, const int spatial_dim, const Dtype* data_1, const Dtype* data_2, Dtype* channel_dot) { - std::string kernel_name = std::string("kernel_channel_dot") + get_dtype_suffix(); + std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); @@ -298,7 +301,7 @@ void SoftmaxLossForwardGPU(const int nthreads, const bool has_ignore_label_, const int ignore_label_, Dtype* counts) { - std::string kernel_name = std::string("SoftmaxLossForwardGPU") + get_dtype_suffix(); + std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); @@ -328,7 +331,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, Dtype* counts) { - std::string kernel_name = std::string("SoftmaxLossBackwardGPU") + get_dtype_suffix(); + std::string kernel_name = "SoftmaxLossBackwardGPU" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); @@ -409,7 +412,7 @@ template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const template void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int 
width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){ - std::string kernel_name = std::string("MaxPoolForward") + get_dtype_suffix(); + std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -443,7 +446,7 @@ template void MaxPoolForward(const int count, const double* bottom_data, template void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data) { - std::string kernel_name = std::string("StoPoolForwardTrain") + get_dtype_suffix(); + std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); @@ -471,7 +474,7 @@ template void StoPoolForwardTrain(const int count, const double* bottom_ template void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){ - std::string kernel_name = std::string("StoPoolForwardTest") + get_dtype_suffix(); + std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -500,7 +503,7 @@ template void StoPoolForwardTest(const int count, const double* bottom_d template void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, 
const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){ - std::string kernel_name = std::string("AvePoolForward") + get_dtype_suffix(); + std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); @@ -580,7 +583,7 @@ template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const d template void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){ - std::string kernel_name = std::string("MaxPoolBackward") + get_dtype_suffix(); + std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); @@ -613,7 +616,7 @@ template void MaxPoolBackward(const int nthreads, const double* const to template void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff) { - std::string kernel_name = std::string("AvePoolBackward") + get_dtype_suffix(); + std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -643,7 +646,7 @@ template void AvePoolBackward(const int nthreads, const double* const to template void 
StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){ - std::string kernel_name = std::string("StoPoolBackward") + get_dtype_suffix(); + std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); @@ -696,7 +699,7 @@ template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const d template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ - std::string kernel_name = std::string("ReLUForward") + get_dtype_suffix(); + std::string kernel_name = "ReLUForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); @@ -714,7 +717,7 @@ template void ReLUForward(const int count, const double* bottom_data, do template void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ - std::string kernel_name = std::string("ReLUBackward") + get_dtype_suffix(); + std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -731,9 +734,12 @@ void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_da } template void ReLUBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); template void ReLUBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); + template -void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels, 
+void opttrans(const Dtype* data_im, const int im_offset, const int channels, const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int num_kernels = channels * height * width * optnum; // To avoid involving atomic operations, we will launch one kernel per @@ -757,9 +763,9 @@ void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } -template void opttrans(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels, +template void opttrans(const float* data_im, const int im_offset, const int channels, const int height, const int width, float* data_opt, const int opt_offset, const int optnum); -template void opttrans(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels, +template void opttrans(const double* data_im, const int im_offset, const int channels, const int height, const int width, double* data_opt, const int opt_offset, const int optnum); template @@ -883,7 +889,7 @@ template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double template void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = std::string("div") + get_dtype_suffix(); + std::string kernel_name = "div" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); @@ -901,7 +907,7 @@ template void caffe_gpu_div (const int n, const double* a, const double* template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ - std::string kernel_name = std::string("add_scalar") + get_dtype_suffix(); + std::string kernel_name = "add_scalar" + get_dtype_suffix(); cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); @@ -918,7 +924,7 @@ template void caffe_gpu_add_scalar (const int n, const double alpha, dou template void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = std::string("element_mul") + get_dtype_suffix(); + std::string kernel_name = "element_mul" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -937,7 +943,7 @@ template void caffe_gpu_mul (const int n, const double* a, const double* template void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ - std::string kernel_name = std::string("powx") + get_dtype_suffix(); + std::string kernel_name = "powx" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); @@ -956,7 +962,7 @@ template void caffe_gpu_powx (const int n, const double* a, const double template void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) { - std::string kernel_name = std::string("DropoutForward") + get_dtype_suffix(); + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -978,7 +984,7 @@ template void DropoutForward(const int count, const double* bottom_data, template void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) { - std::string kernel_name = std::string("DropoutBackward") + get_dtype_suffix(); + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); cl_int ret; From 77c1824894b3cf88b0a9769de09193306f8fff9e Mon Sep 17 00:00:00 2001 From: Yibing Date: Mon, 10 Aug 2015 01:40:47 +0800 Subject: [PATCH 033/124] fixed some kernel name errors 
and re-organized the kernel files --- include/caffe/util/ocl_wrapper.hpp | 10 + src/caffe/layers/prelu_layer.cpp | 62 ++++- src/caffe/ocl/im2col.cl | 89 +++---- src/caffe/ocl/prelu_layer.cl | 32 +++ src/caffe/ocl/{OCL_kernel.cl => random.cl} | 273 --------------------- src/caffe/ocl/softmax_layer.cl | 97 ++++++++ src/caffe/ocl/softmaxwithloss_layer.cl | 14 +- src/caffe/ocl/util.cl | 136 ++++++++++ src/caffe/util/im2col.cpp | 123 +--------- src/caffe/util/ocl_wrapper.cpp | 57 +++++ 10 files changed, 434 insertions(+), 459 deletions(-) create mode 100644 src/caffe/ocl/prelu_layer.cl rename src/caffe/ocl/{OCL_kernel.cl => random.cl} (59%) create mode 100644 src/caffe/ocl/util.cl diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index dbe2eb49..7109bfd1 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -67,6 +67,16 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); + +template +void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor); + +template +void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor); + +template +void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff); + template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 4db0dc7c..ed51ac5e 
100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -132,14 +132,73 @@ void PReLULayer::Backward_cpu(const vector*>& top, template void PReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + const int div_factor = channel_shared_ ? channels : 1; + + if (top[0] == bottom[0]) { + caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor); } template void PReLULayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ -} + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.gpu_data(); + } + // Propagate to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. 
+ if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); + int cdim = channels * dim; + Dtype dsum = 0.; + for (int n = 0; n < bottom[0]->num(); ++n) { + // compute element-wise diff + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUParamBackward( + cdim, top_diff + top[0]->offset(n), + bottom_data + bottom[0]->offset(n), + backward_buff_.mutable_gpu_diff()); + if (channel_shared_) { + Dtype d; + caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), + multiplier_.gpu_data(), &d); + dsum += d; + } else { + caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., + backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., + slope_diff); + } + } + if (channel_shared_) { + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + int div_factor = channel_shared_ ? channels : 1; + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, + div_factor); + } +} #ifdef CPU_ONLY STUB_GPU(PReLULayer); @@ -147,5 +206,4 @@ STUB_GPU(PReLULayer); INSTANTIATE_CLASS(PReLULayer); REGISTER_LAYER_CLASS(PReLU); - } // namespace caffe diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 728f8dd3..77367fa6 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -186,37 +186,6 @@ __kernel void col2im(const int n, __global T* data_col, const int col_offset, co template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, 
__global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); -template -__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index;index= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col += height_col *width_col; - } - } - } -} - -template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); -template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); - template __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){ int index = get_global_id(0); @@ -248,36 +217,6 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* 
data_im, const int img_offset, const int optnum); template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); - -template -__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < n; index += tmp){ - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} -template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); -template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); - template __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){ @@ -296,3 +235,31 @@ __kernel void opttrans(const int n, __global T* data_im, const int im_offset, co } template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); + +template +__kernel 
void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +} +template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); + +template +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ + int gidx = get_global_id(0); + int index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0 ; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; +} +template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl new file mode 100644 index 00000000..83724d1a --- /dev/null +++ b/src/caffe/ocl/prelu_layer.cl @@ -0,0 +1,32 @@ +template +__kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count){ + int c = (index / dim) % 
channels / div_factor; + out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; + } +} +template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count){ + int c = (index / dim) % channels / div_factor; + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + + (in_data[index] <= 0) * slope_data[c]); + } +} +template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUParamBackward(const int count, __global T* in_diff, __global T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if(index < count){ + out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); + } +} +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff); +template 
__attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/random.cl similarity index 59% rename from src/caffe/ocl/OCL_kernel.cl rename to src/caffe/ocl/random.cl index a014a5cf..4980f8d2 100644 --- a/src/caffe/ocl/OCL_kernel.cl +++ b/src/caffe/ocl/random.cl @@ -724,276 +724,3 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t //end of the looooooong gpu_random_generator kernel - -template -__kernel void OCL_memset(__global T* buffer, const T value, const int size){ - int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; - } -} - -template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); -template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); - -__kernel void OCL_memset2(__global int* buffer, const int value, const int size){ - int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; - } -} - -template -__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ - int gdx = get_global_id(0); - if(gdx < N){ - Y[gdx] =((0.0 -__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) { - T maxval = -FLT_MAX; - for (int i = 0; i < dim; i++) - maxval = max( data[index*dim + i], maxval ); - out[index] = maxval; - } -} - -template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); -template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out); - -template -__kernel void exp (const int num, 
__global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) - out[index] = exp(data[index]); -} - -template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); -template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); - - - -template -__kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* out); -template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* out); - -template -__kernel void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_max, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); -template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, 
const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); - -template -__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = exp(data[index]); - } -} - -template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); -template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); - -template -__kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* channel_sum) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } -} - -template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* channel_sum); -template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* channel_sum); - -template -__kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_sum, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, - const int num, const int 
channels, - const int spatial_dim, __global const float* channel_sum, __global float* data); -template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const double* channel_sum, __global double* data); - -template -__kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const T* data_1, __global const T* data_2, - __global T* channel_dot) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const float* data_1, __global const float* data_2, - __global float* channel_dot); -template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const double* data_1, __global const double* data_2, - __global double* channel_dot); - - - -template -__kernel void diff (const int num, const int dim, __global T* data, __global T* label){ - int index = get_global_id(0); - int total = get_global_size(0); - int offset; - for(index; index < num; index += total){ - offset = (int) label[index]; - data[index * dim + offset] -= 1; - } -} - -template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); -template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); - -template -__kernel void scal (const 
int num, const T alpha, __global T* data){ - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num; index += total){ - data[index] = data[index] * alpha; - } -} - -template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); -template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); - -template -__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] = a[index] / b[index]; -} - -template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); -//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); - -template -__kernel void add_scalar (const int n, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] += alpha; -} - -template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); -template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); - -template -__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ - int index = get_global_id(0); - if (index < n) - y[index] = in1[index] + in2[index] ; -} -template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); -template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); - -template -__kernel void element_mul (const int n, __global const T* a, __global const T* b, 
__global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] = a[index] * b[index]; -} - -template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); -template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); - - -template -__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) -// y[index] = a[index] + alpha; - y[index] = pow(a[index], alpha); -} - -template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); -template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); - - -template -__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidyy = gidy; - int index = gidy / height; - int offset = index * width * height; - gidy = gidy % height; - if( gidx < width && gidyy < height * optnum ) - dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; -} -template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); -template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); - -template -__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ - int gidx = get_global_id(0); - int index; - index = (optnum==1) ? 
0: gidx % optnum; - dst = dst + top_offset; // now we point at (*top)[n] - int offset = gidx / optnum; - int i = 0; - for(i = 0 ; i < width; i++) - dst[(index * height + offset)* width + i] = src[gidx * width + i]; -} -template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); -template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl index 711e4334..ef1255a4 100644 --- a/src/caffe/ocl/softmax_layer.cl +++ b/src/caffe/ocl/softmax_layer.cl @@ -46,3 +46,100 @@ __kernel void softmax_div (const int num, const int dim, __global T* scale, __gl template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data); + +template +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, 
const int channels, + const int spatial_dim, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} +template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); + +template +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* channel_sum); +template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* channel_sum); + +template +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + 
int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + +template +__kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } +} + +template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); +template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index 97eb6874..cec6346b 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -59,7 +59,19 @@ template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel v 
int spatial_dim, bool has_ignore_label_, int ignore_label_, float* counts); -template __attribute__ ((mangled_name(SoftmaxLossBackward_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, __global double* label,__global double* bottom_diff, int num, int dim, int spatial_dim, bool has_ignore_label_, int ignore_label_, double* counts); + +template +__kernel void scal (const int num, const T alpha, __global T* data){ + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total){ + data[index] = data[index] * alpha; + } +} + +template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); +template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl new file mode 100644 index 00000000..eb01fae9 --- /dev/null +++ b/src/caffe/ocl/util.cl @@ -0,0 +1,136 @@ +#pragma OPENCL EXTENSION cl_amd_printf : enable + +template +__kernel void OCL_memset(__global T* buffer, const T value, const int size){ + int gdx = get_global_id(0); + if(gdx < size){ + buffer[gdx] = value; + } +} + +template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); +template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); + +__kernel void OCL_memset2(__global int* buffer, const int value, const int size){ + int gdx = get_global_id(0); + if(gdx < size){ + buffer[gdx] = value; + } +} + +template +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ + int gdx = get_global_id(0); + if(gdx < N){ + Y[gdx] =((0.0 +__kernel 
void get_max(const int num, const int dim, __global T* data, __global T* out){ + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); +template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out); + +template +__kernel void exp (const int num, __global T* data, __global T* out){ + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); +} + +template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); +template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); + + +template +__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); + + +template +__kernel void diff (const int num, const int dim, __global T* data, __global T* label){ + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total){ + offset = (int) label[index]; + data[index * dim + offset] -= 1; + } +} + +template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); +template __attribute__ 
((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); + + +template +__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] = a[index] / b[index]; +} + +template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); +//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); + +template +__kernel void add_scalar (const int n, const T alpha, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] += alpha; +} + +template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); +template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); + +template +__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index] ; +} +template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); +template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); + +template +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; +} + +template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); +template __attribute__ ((mangled_name(element_mul_double))) 
__kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); + + +template +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ + int index = get_global_id(0); + if (index < n) +// y[index] = a[index] + alpha; + y[index] = pow(a[index], alpha); +} + +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); + + diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index a5eb4176..e75c0d9a 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -257,43 +257,6 @@ template void im2col_gpu(cl_kernel Kernel, const double* data_im, const const int height, const int width, const int ksize, const int pad, const int stride, double* data_col, const int col_offset); -template -void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = 16 * channels * height_col * width_col; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); - 
ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {num_kernels}; - size_t uiLocal_Work_Size[] = {256 - 256 % width_col}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} - -template void im2col_16_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset); -template void im2col_16_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset); - template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, const int pad, @@ -339,7 +302,7 @@ template void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_im, const int img_offset) { - std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad - ksize) / stride + 1; @@ -378,89 +341,5 @@ template void col2im_gpu(const double* data_col, const int col_offset, c const int height, const int width, const int psize, const int pad, const int stride, double* data_im, const int img_offset); -template -void im2col_gpu_ocl(cl_mem data_im, const int channels, 
- const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, cl_kernel Kernel) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col); - OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) ); - - if(ret!=CL_SUCCESS){ - fprintf(stderr,"Failed to Set Args\n"); - } - - size_t uiGlobal_Work_Size[] = {num_kernels}; - size_t uiLocal_Work_Size[] = {64}; - cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL); - if(CL_SUCCESS!=iStatus){ - fprintf(stderr,"Failed to enqueue kernel\n"); - } -} - -template void im2col_gpu_ocl(cl_mem data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, cl_kernel Kernel); -template void im2col_gpu_ocl(cl_mem data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, cl_kernel Kernel); - -template -void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, cl_kernel Kernel) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width 
+ 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - // NOLINT_NEXT_LINE(whitespace/operatiors) - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_im); - - if(ret!=CL_SUCCESS){ - fprintf(stderr,"Failed to Set Args\n"); - } - - size_t uiGlobal_Work_Size[] = {num_kernels}; - size_t uiLocal_Work_Size[] = {64}; - cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL); - if(CL_SUCCESS!=iStatus){ - fprintf(stderr,"Failed to enqueue kernel\n"); - } -} - - -template void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, float* data_im, cl_kernel Kernel); -template void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, double* data_im, cl_kernel Kernel); } // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index ac1d9958..1bdd4320 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -697,6 +697,63 @@ void 
ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); + +template +void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor){ + std::string kernel_name = "PReLUForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&div_factor); + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUForward(const int count, const int channels, const int dim,const float* bottom_data, float* top_data, const float* slope_data, const int div_factor); +template void PReLUForward(const int count, const int channels, const int dim,const double* bottom_data, double* top_data, const double* slope_data, 
const int div_factor); + +template +void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor){ + std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor); + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUBackward(const int count, const int channels, const int dim, const float* top_diff, const float* bottom_data, float* bottom_diff, const float* slope_data, const int div_factor); +template void PReLUBackward(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor); + +template +void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){ + std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), 
(void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); + size_t Global_Work_Size[] = {count * 1}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUParamBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff); +template void PReLUParamBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff); + + template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ std::string kernel_name = "ReLUForward" + get_dtype_suffix(); From cdd4d9debaf6377bcf9dc421f8afab6d032a164b Mon Sep 17 00:00:00 2001 From: Yibing Date: Mon, 10 Aug 2015 15:38:03 +0800 Subject: [PATCH 034/124] add AMD's license --- include/caffe/device.hpp | 26 ++++++++++++++++++++++++++ include/caffe/syncedmem.hpp | 26 ++++++++++++++++++++++++++ include/caffe/util/im2col.hpp | 26 ++++++++++++++++++++++++++ include/caffe/util/math_functions.hpp | 26 +++++++++++++++++++++++++- include/caffe/util/ocl_util.hpp | 26 +++++++++++++++++++++++++- include/caffe/util/ocl_wrapper.hpp | 26 +++++++++++++++++++++++++- src/caffe/device.cpp | 26 ++++++++++++++++++++++++++ src/caffe/layers/conv_layer.cpp | 1 + src/caffe/ocl/dropout_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/im2col.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/lrn_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/pooling_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/prelu_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/random.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/relu_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/softmax_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/softmaxwithloss_layer.cl | 26 ++++++++++++++++++++++++++ src/caffe/ocl/util.cl | 26 ++++++++++++++++++++++++++ src/caffe/syncedmem.cpp | 26 
++++++++++++++++++++++++++ src/caffe/util/im2col.cpp | 26 ++++++++++++++++++++++++++ src/caffe/util/math_functions.cpp | 26 +++++++++++++++++++++++++- src/caffe/util/ocl_util.cpp | 26 +++++++++++++++++++++++++- src/caffe/util/ocl_wrapper.cpp | 26 +++++++++++++++++++++++++- 23 files changed, 567 insertions(+), 6 deletions(-) diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index cea343e8..6561ec48 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #ifndef CAFFE_DEVICE_HPP #define CAFFE_DEVICE_HPP #include diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 2cb316fb..0fe6546d 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #ifndef CAFFE_SYNCEDMEM_HPP_ #define CAFFE_SYNCEDMEM_HPP_ diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index aec9e330..ba9c4aca 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index a5ca6470..1dae00e0 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -1,4 +1,28 @@ -// Copyright 2014 BVLC and contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 25747702..2e56101e 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -1,4 +1,28 @@ -// Copyright 2014 AMD DNN contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ #ifndef _CAFFE_UTIL_OCL_UTIL_HPP_ #define _CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 7109bfd1..7351f8bc 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -1,4 +1,28 @@ -// Copyright 2014 AMD DNN contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ #ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_ #define _CAFFE_UTIL_OCL_WRAPPER_HPP_ diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 23c3789b..3ce6cefe 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #include "caffe/common.hpp" #include "caffe/device.hpp" #include diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 48b7afe9..855c00e1 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -80,6 +80,7 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, Forward_gpu_opt(bottom, top); else Forward_gpu_org(bottom, top); + CHECK_BLOB_DATA(top[0],20, "top[0]"); } template diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl index 8d3db447..4bfa39bc 100644 --- a/src/caffe/ocl/dropout_layer.cl +++ b/src/caffe/ocl/dropout_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){ int index = get_global_id(0); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 77367fa6..3e535d5f 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){ int index=get_global_id(0); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl index 901b5b13..ae1c9269 100644 --- a/src/caffe/ocl/lrn_layer.cl +++ b/src/caffe/ocl/lrn_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { int index = get_global_id(0); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 80289b68..d94efcba 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ int index = get_global_id(0); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index 83724d1a..be85a2e4 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { int index = get_global_id(0); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index 4980f8d2..f5a7a4db 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #pragma OPENCL EXTENSION cl_amd_printf : enable //beginning of the looooooong gpu_random_generator kernel diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index df26d66e..d3b36a34 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ int index = get_global_id(0); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl index ef1255a4..6b225283 100644 --- a/src/caffe/ocl/softmax_layer.cl +++ b/src/caffe/ocl/softmax_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){ diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index cec6346b..9dbe284f 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void SoftmaxLossForwardGPU(const int nthreads, __global T* prob_data, __global T* label,__global T* loss, diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index eb01fae9..55026603 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #pragma OPENCL EXTENSION cl_amd_printf : enable template diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index ac1187b9..123b0053 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #include #include "caffe/common.hpp" diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index e75c0d9a..29c6c1f9 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #include #include #include diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 54e0abdc..677afcdf 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,4 +1,28 @@ -// Copyright 2014 BVLC and contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ #include #include diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 044f9e69..01c04711 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -1,4 +1,28 @@ -// Copyright 2014 AMD DNN contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ #include #include diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 1bdd4320..a9abda2e 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1,4 +1,28 @@ -// Copyright 2014 AMD DNN contributors. +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ #include #include From ed958d8e77c32e71af607daed2e02a25aa61684e Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 25 Aug 2015 13:53:08 +0800 Subject: [PATCH 035/124] This is a test layer --- include/caffe/common.hpp | 7 +- include/caffe/device.hpp | 6 +- include/caffe/util/math_functions.hpp | 15 + include/caffe/util/ocl_wrapper.hpp | 14 + .../solver_without_dropout.prototxt | 14 + .../train_val_without_dropout.prototxt | 366 +++ .../CMakeDirectoryInformation.cmake | 16 + .../CMakeFiles/caffe.dir/DependInfo.cmake | 108 + src/caffe/CMakeFiles/caffe.dir/build.make | 2542 +++++++++++++++++ .../CMakeFiles/caffe.dir/cmake_clean.cmake | 126 + src/caffe/CMakeFiles/caffe.dir/depend.make | 2 + src/caffe/CMakeFiles/caffe.dir/flags.make | 8 + src/caffe/CMakeFiles/caffe.dir/link.txt | 1 + src/caffe/CMakeFiles/caffe.dir/progress.make | 118 + ..._compile_generated_absval_layer.cu.o.cmake | 296 ++ ...compile_generated_absval_layer.cu.o.depend | 1 + ...mpile_generated_base_data_layer.cu.o.cmake | 296 ++ ...pile_generated_base_data_layer.cu.o.depend | 1 + ...da_compile_generated_bnll_layer.cu.o.cmake | 296 ++ ...a_compile_generated_bnll_layer.cu.o.depend | 1 + ..._compile_generated_concat_layer.cu.o.cmake | 296 ++ ...compile_generated_concat_layer.cu.o.depend | 1 + ...enerated_contrastive_loss_layer.cu.o.cmake | 296 ++ ...nerated_contrastive_loss_layer.cu.o.depend | 1 + ...da_compile_generated_conv_layer.cu.o.cmake | 296 ++ ...a_compile_generated_conv_layer.cu.o.depend | 1 + ...pile_generated_cudnn_conv_layer.cu.o.cmake | 296 ++ ...ile_generated_cudnn_conv_layer.cu.o.depend | 1 + ...e_generated_cudnn_pooling_layer.cu.o.cmake | 296 ++ ..._generated_cudnn_pooling_layer.cu.o.depend | 1 + ...pile_generated_cudnn_relu_layer.cu.o.cmake | 296 ++ ...ile_generated_cudnn_relu_layer.cu.o.depend | 1 + ...e_generated_cudnn_sigmoid_layer.cu.o.cmake | 296 ++ ..._generated_cudnn_sigmoid_layer.cu.o.depend | 1 + 
...e_generated_cudnn_softmax_layer.cu.o.cmake | 296 ++ ..._generated_cudnn_softmax_layer.cu.o.depend | 1 + ...pile_generated_cudnn_tanh_layer.cu.o.cmake | 296 ++ ...ile_generated_cudnn_tanh_layer.cu.o.depend | 1 + ..._compile_generated_deconv_layer.cu.o.cmake | 296 ++ ...compile_generated_deconv_layer.cu.o.depend | 1 + ...compile_generated_dropout_layer.cu.o.cmake | 296 ++ ...ompile_generated_dropout_layer.cu.o.depend | 1 + ...compile_generated_eltwise_layer.cu.o.cmake | 296 ++ ...ompile_generated_eltwise_layer.cu.o.depend | 1 + ..._generated_euclidean_loss_layer.cu.o.cmake | 296 ++ ...generated_euclidean_loss_layer.cu.o.depend | 1 + ...uda_compile_generated_exp_layer.cu.o.cmake | 296 ++ ...da_compile_generated_exp_layer.cu.o.depend | 1 + ..._compile_generated_filter_layer.cu.o.cmake | 296 ++ ...compile_generated_filter_layer.cu.o.depend | 1 + ...mpile_generated_hdf5_data_layer.cu.o.cmake | 296 ++ ...pile_generated_hdf5_data_layer.cu.o.depend | 1 + ...ile_generated_hdf5_output_layer.cu.o.cmake | 296 ++ ...le_generated_hdf5_output_layer.cu.o.depend | 1 + ..._compile_generated_im2col_layer.cu.o.cmake | 296 ++ ...compile_generated_im2col_layer.cu.o.depend | 1 + ...e_generated_inner_product_layer.cu.o.cmake | 296 ++ ..._generated_inner_product_layer.cu.o.depend | 1 + ...uda_compile_generated_log_layer.cu.o.cmake | 296 ++ ...da_compile_generated_log_layer.cu.o.depend | 1 + ...uda_compile_generated_lrn_layer.cu.o.cmake | 296 ++ ...da_compile_generated_lrn_layer.cu.o.depend | 1 + ...uda_compile_generated_mvn_layer.cu.o.cmake | 296 ++ ...da_compile_generated_mvn_layer.cu.o.depend | 1 + ...compile_generated_pooling_layer.cu.o.cmake | 296 ++ ...ompile_generated_pooling_layer.cu.o.depend | 1 + ...a_compile_generated_power_layer.cu.o.cmake | 296 ++ ..._compile_generated_power_layer.cu.o.depend | 1 + ...a_compile_generated_prelu_layer.cu.o.cmake | 296 ++ ..._compile_generated_prelu_layer.cu.o.depend | 1 + ...mpile_generated_reduction_layer.cu.o.cmake | 296 ++ 
...pile_generated_reduction_layer.cu.o.depend | 1 + ...da_compile_generated_relu_layer.cu.o.cmake | 296 ++ ...a_compile_generated_relu_layer.cu.o.depend | 1 + ...igmoid_cross_entropy_loss_layer.cu.o.cmake | 296 ++ ...gmoid_cross_entropy_loss_layer.cu.o.depend | 470 +++ ...compile_generated_sigmoid_layer.cu.o.cmake | 296 ++ ...ompile_generated_sigmoid_layer.cu.o.depend | 468 +++ ...compile_generated_silence_layer.cu.o.cmake | 296 ++ ...ompile_generated_silence_layer.cu.o.depend | 1 + ...a_compile_generated_slice_layer.cu.o.cmake | 296 ++ ..._compile_generated_slice_layer.cu.o.depend | 1 + ...compile_generated_softmax_layer.cu.o.cmake | 296 ++ ...ompile_generated_softmax_layer.cu.o.depend | 1 + ...le_generated_softmax_loss_layer.cu.o.cmake | 296 ++ ...e_generated_softmax_loss_layer.cu.o.depend | 1 + ...a_compile_generated_split_layer.cu.o.cmake | 296 ++ ..._compile_generated_split_layer.cu.o.depend | 1 + ...da_compile_generated_tanh_layer.cu.o.cmake | 296 ++ ...a_compile_generated_tanh_layer.cu.o.depend | 1 + ...mpile_generated_threshold_layer.cu.o.cmake | 296 ++ ...pile_generated_threshold_layer.cu.o.depend | 1 + .../cuda_compile_generated_im2col.cu.o.cmake | 296 ++ .../cuda_compile_generated_im2col.cu.o.depend | 404 +++ ...ompile_generated_math_functions.cu.o.cmake | 296 ++ ...mpile_generated_math_functions.cu.o.depend | 744 +++++ src/caffe/CMakeFiles/progress.marks | 1 + .../CMakeFiles/proto.dir/CXX.includecache | 48 + .../CMakeFiles/proto.dir/DependInfo.cmake | 39 + src/caffe/CMakeFiles/proto.dir/build.make | 119 + .../CMakeFiles/proto.dir/cmake_clean.cmake | 13 + .../proto.dir/cmake_clean_target.cmake | 3 + .../CMakeFiles/proto.dir/depend.internal | 6 + src/caffe/CMakeFiles/proto.dir/depend.make | 6 + src/caffe/CMakeFiles/proto.dir/flags.make | 8 + src/caffe/CMakeFiles/proto.dir/link.txt | 2 + src/caffe/CMakeFiles/proto.dir/progress.make | 3 + src/caffe/Makefile | 2279 +++++++++++++++ src/caffe/cmake_install.cmake | 79 + src/caffe/common.cpp | 1 + 
src/caffe/device.cpp | 45 +- src/caffe/layers/conv_layer.cpp | 20 +- src/caffe/layers/softmax_loss_layer.cpp | 1 + src/caffe/net.cpp | 15 +- src/caffe/ocl/pooling_layer.cl | 4 +- src/caffe/ocl/util.cl | 1 + src/caffe/solver.cpp | 4 +- .../CMakeDirectoryInformation.cmake | 16 + ...le_generated_test_im2col_kernel.cu.o.cmake | 296 ++ ...e_generated_test_im2col_kernel.cu.o.depend | 1 + src/caffe/test/CMakeFiles/progress.marks | 1 + .../CMakeFiles/runtest.dir/DependInfo.cmake | 27 + .../test/CMakeFiles/runtest.dir/build.make | 69 + .../CMakeFiles/runtest.dir/cmake_clean.cmake | 8 + .../test/CMakeFiles/runtest.dir/progress.make | 1 + .../test.testbin.dir/DependInfo.cmake | 92 + .../CMakeFiles/test.testbin.dir/build.make | 1623 +++++++++++ .../test.testbin.dir/cmake_clean.cmake | 68 + .../CMakeFiles/test.testbin.dir/depend.make | 2 + .../CMakeFiles/test.testbin.dir/flags.make | 8 + .../test/CMakeFiles/test.testbin.dir/link.txt | 1 + .../CMakeFiles/test.testbin.dir/progress.make | 60 + src/caffe/test/Makefile | 1766 ++++++++++++ src/caffe/test/cmake_install.cmake | 34 + src/caffe/test/test_caffe_main.cpp | 12 +- src/caffe/util/benchmark.cpp | 47 +- src/caffe/util/math_functions.cpp | 26 + src/caffe/util/ocl_util.cpp | 1 + src/caffe/util/ocl_wrapper.cpp | 78 +- .../CMakeDirectoryInformation.cmake | 16 + .../CMakeFiles/gtest.dir/DependInfo.cmake | 32 + src/gtest/CMakeFiles/gtest.dir/build.make | 106 + .../CMakeFiles/gtest.dir/cmake_clean.cmake | 10 + .../gtest.dir/cmake_clean_target.cmake | 3 + src/gtest/CMakeFiles/gtest.dir/depend.make | 2 + src/gtest/CMakeFiles/gtest.dir/flags.make | 8 + src/gtest/CMakeFiles/gtest.dir/link.txt | 2 + src/gtest/CMakeFiles/gtest.dir/progress.make | 2 + src/gtest/CMakeFiles/progress.marks | 1 + src/gtest/Makefile | 212 ++ src/gtest/cmake_install.cmake | 34 + 151 files changed, 24881 insertions(+), 83 deletions(-) create mode 100644 models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt create mode 100644 
models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt create mode 100644 src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake create mode 100644 src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake create mode 100644 src/caffe/CMakeFiles/caffe.dir/build.make create mode 100644 src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake create mode 100644 src/caffe/CMakeFiles/caffe.dir/depend.make create mode 100644 src/caffe/CMakeFiles/caffe.dir/flags.make create mode 100644 src/caffe/CMakeFiles/caffe.dir/link.txt create mode 100644 src/caffe/CMakeFiles/caffe.dir/progress.make create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake create mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake create mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake create mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake create mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake create mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend create mode 100644 src/caffe/CMakeFiles/progress.marks create mode 100644 src/caffe/CMakeFiles/proto.dir/CXX.includecache create mode 100644 src/caffe/CMakeFiles/proto.dir/DependInfo.cmake create mode 100644 src/caffe/CMakeFiles/proto.dir/build.make create mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake create mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake create mode 100644 src/caffe/CMakeFiles/proto.dir/depend.internal create mode 100644 src/caffe/CMakeFiles/proto.dir/depend.make create mode 100644 src/caffe/CMakeFiles/proto.dir/flags.make create mode 100644 src/caffe/CMakeFiles/proto.dir/link.txt create mode 100644 src/caffe/CMakeFiles/proto.dir/progress.make create mode 100644 src/caffe/Makefile create mode 100644 src/caffe/cmake_install.cmake create mode 100644 src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake create mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake create mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend create mode 100644 src/caffe/test/CMakeFiles/progress.marks create mode 100644 
src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/build.make create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/progress.make create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/build.make create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/depend.make create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/flags.make create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/link.txt create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/progress.make create mode 100644 src/caffe/test/Makefile create mode 100644 src/caffe/test/cmake_install.cmake create mode 100644 src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake create mode 100644 src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake create mode 100644 src/gtest/CMakeFiles/gtest.dir/build.make create mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake create mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake create mode 100644 src/gtest/CMakeFiles/gtest.dir/depend.make create mode 100644 src/gtest/CMakeFiles/gtest.dir/flags.make create mode 100644 src/gtest/CMakeFiles/gtest.dir/link.txt create mode 100644 src/gtest/CMakeFiles/gtest.dir/progress.make create mode 100644 src/gtest/CMakeFiles/progress.marks create mode 100644 src/gtest/Makefile create mode 100644 src/gtest/cmake_install.cmake diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 070513b5..4cd372a6 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -84,7 +84,7 @@ private:\ #define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -#define 
global_packing_N 16 +#define global_packing_N 32 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ @@ -231,7 +231,10 @@ class Caffe { // into the program since that may cause allocation of pinned memory being // freed in a non-pinned way, which may cause problems - I haven't verified // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { Get().mode_ = mode; } + inline static void set_mode(Brew mode) { + Get().mode_ = mode; + amdDevice.Init(); + } // Sets the random seed of both boost and curand static void set_random_seed(const unsigned int seed); // Sets the device. Since we have cublas and curand stuff, set device also diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 6561ec48..31adcb5f 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -34,7 +34,7 @@ namespace caffe { class Device{ public: - Device():numPlatforms(0),numDevices(0){} + Device():numPlatforms(0),numDevices(0){ } ~Device(); cl_uint numPlatforms; cl_platform_id * platformIDs; @@ -57,7 +57,7 @@ class Device{ void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); void GetDeviceInfo(); - + void DeviceQuery(); void BuildProgram(std::string kernel_dir); template @@ -66,7 +66,7 @@ class Device{ void appendBitfield(T info, T value, std::string name, std::string &str); cl_kernel GetKernel(std::string kernel_name); - + void ReleaseKernels(); }; extern char* buildOption; extern Device amdDevice; diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 1dae00e0..381dd8fd 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -34,6 +34,7 @@ #include "glog/logging.h" #include "caffe/util/mkl_alternate.hpp" +#include "caffe/util/ocl_util.hpp" namespace caffe { @@ -115,6 +116,20 @@ void caffe_set(const int N, const Dtype alpha, Dtype *X); template 
void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); +inline void caffe_memset(const size_t N, const int alpha, void* X) { + memset(X, alpha, N); // NOLINT(caffe/alt_fn) +} + +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + ocl_memset((int*)X, alpha, N); +#else + NO_GPU; +#endif +} + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + template void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 7351f8bc..223e3278 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -33,6 +33,20 @@ typedef unsigned int uint32_t; //template //void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); +template inline std::string get_dtype_suffix() +{ + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch(type){ + case 'i': suffix = "_int"; break; + case 'd': suffix = "_double"; break; + case 'f': + default: suffix = "_float"; + } + return suffix; +} + template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); diff --git a/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt new file mode 100644 index 00000000..37b1d0d3 --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt @@ -0,0 +1,14 @@ +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.1 +stepsize: 100000 +display: 1 +max_iter: 450000 +momentum: 0.9 +weight_decay: 0.0005 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU diff --git a/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt 
b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt new file mode 100644 index 00000000..f269ca0d --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt @@ -0,0 +1,366 @@ +name: "AlexNet" +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 256 + backend: LMDB + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: 
"relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: 
"gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} diff --git a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake new file mode 100644 index 00000000..7bb0014c --- /dev/null +++ b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake @@ -0,0 +1,16 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Relative path conversion top directories. +SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe") +SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe") + +# Force unix paths in dependencies. +SET(CMAKE_FORCE_UNIX_PATHS 1) + + +# The C and CXX include file regular expressions for this directory. 
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$") +SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$") +SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN}) +SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN}) diff --git a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake new file mode 100644 index 00000000..1678bc46 --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake @@ -0,0 +1,108 @@ +# The set of languages for which implicit dependencies are needed: +SET(CMAKE_DEPENDS_LANGUAGES + "CXX" + ) +# The set of files for implicit dependencies of each language: +SET(CMAKE_DEPENDS_CHECK_CXX + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/blob.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/common.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/device.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/net.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/solver.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" + ) +SET(CMAKE_CXX_COMPILER_ID "GNU") + +# Preprocessor definitions for this target. +SET(CMAKE_TARGET_DEFINITIONS + "GTEST_USE_OWN_TR1_TUPLE" + ) + +# Targets to which this target links. +SET(CMAKE_TARGET_LINKED_INFO_FILES + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake" + ) + +# The include file search paths: +SET(CMAKE_C_TARGET_INCLUDE_PATH + "src" + "/usr/local/include" + "include" + "/usr/local/cuda/include" + "/usr/local/include/opencv" + "/usr/include/atlas" + "." 
+ ) +SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/CMakeFiles/caffe.dir/build.make b/src/caffe/CMakeFiles/caffe.dir/build.make new file mode 100644 index 00000000..916913ae --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/build.make @@ -0,0 +1,2542 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# Include any dependencies generated for this target. +include src/caffe/CMakeFiles/caffe.dir/depend.make + +# Include the progress variables for this target. +include src/caffe/CMakeFiles/caffe.dir/progress.make + +# Include the compile flags for this target's objects. 
+include src/caffe/CMakeFiles/caffe.dir/flags.make + +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/util/math_functions.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/util/im2col.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/layers/cufiles/sigmoid_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/layers/cufiles/bnll_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/layers/cufiles/conv_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/layers/cufiles/pooling_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/layers/cufiles/log_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/layers/cufiles/reduction_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/layers/cufiles/silence_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/layers/cufiles/power_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/layers/cufiles/split_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/layers/cufiles/absval_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/layers/cufiles/hdf5_output_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/layers/cufiles/base_data_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/layers/cufiles/dropout_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/layers/cufiles/cudnn_tanh_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/layers/cufiles/relu_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/layers/cufiles/cudnn_conv_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/layers/cufiles/contrastive_loss_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/layers/cufiles/concat_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/layers/cufiles/softmax_loss_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/layers/cufiles/cudnn_softmax_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/layers/cufiles/inner_product_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/layers/cufiles/filter_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/layers/cufiles/prelu_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/layers/cufiles/im2col_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/layers/cufiles/hdf5_data_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/layers/cufiles/deconv_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/layers/cufiles/mvn_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/layers/cufiles/tanh_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/layers/cufiles/slice_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/layers/cufiles/threshold_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/layers/cufiles/lrn_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/layers/cufiles/eltwise_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/layers/cufiles/exp_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/layers/cufiles/euclidean_loss_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/layers/cufiles/cudnn_relu_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/layers/cufiles/cudnn_pooling_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/layers/cufiles/softmax_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake + +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake +src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake + +src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/common.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/common.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp + +src/caffe/CMakeFiles/caffe.dir/common.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/common.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp > CMakeFiles/caffe.dir/common.cpp.i + +src/caffe/CMakeFiles/caffe.dir/common.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/common.cpp.s" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp -o CMakeFiles/caffe.dir/common.cpp.s + +src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/common.cpp.o + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/blob.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/blob.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/blob.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp > CMakeFiles/caffe.dir/blob.cpp.i + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/blob.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp -o CMakeFiles/caffe.dir/blob.cpp.s + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/util/ocl_wrapper.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp > CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/util/im2col.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/im2col.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/im2col.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp > CMakeFiles/caffe.dir/util/im2col.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/im2col.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp -o CMakeFiles/caffe.dir/util/im2col.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/util/upgrade_proto.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp > 
CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/util/db_leveldb.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_leveldb.cpp.i" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp > CMakeFiles/caffe.dir/util/db_leveldb.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_leveldb.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/util/ocl_util.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_util.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i: cmake_force + @$(CMAKE_COMMAND) 
-E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_util.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp > CMakeFiles/caffe.dir/util/ocl_util.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_util.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp -o CMakeFiles/caffe.dir/util/ocl_util.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/util/insert_splits.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/insert_splits.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/insert_splits.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp > CMakeFiles/caffe.dir/util/insert_splits.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/insert_splits.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp -o CMakeFiles/caffe.dir/util/insert_splits.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/util/db_lmdb.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_lmdb.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp > CMakeFiles/caffe.dir/util/db_lmdb.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_lmdb.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/util/math_functions.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 
$(CMAKE_PROGRESS_51) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/math_functions.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp > CMakeFiles/caffe.dir/util/math_functions.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/math_functions.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp -o CMakeFiles/caffe.dir/util/math_functions.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: 
src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/util/io.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/io.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp > CMakeFiles/caffe.dir/util/io.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/io.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp -o CMakeFiles/caffe.dir/util/io.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: 
src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/util/cudnn.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/cudnn.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/cudnn.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp > CMakeFiles/caffe.dir/util/cudnn.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/cudnn.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp -o CMakeFiles/caffe.dir/util/cudnn.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o + 
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/util/db.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp + +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp > CMakeFiles/caffe.dir/util/db.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp -o CMakeFiles/caffe.dir/util/db.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o + 
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/util/benchmark.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp + +src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/benchmark.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp > CMakeFiles/caffe.dir/util/benchmark.cpp.i + +src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/benchmark.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp -o CMakeFiles/caffe.dir/util/benchmark.cpp.s + +src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides + 
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o + +src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/device.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/device.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/device.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp + +src/caffe/CMakeFiles/caffe.dir/device.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/device.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp > CMakeFiles/caffe.dir/device.cpp.i + +src/caffe/CMakeFiles/caffe.dir/device.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/device.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp -o CMakeFiles/caffe.dir/device.cpp.s + +src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides + 
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/device.cpp.o + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/internal_thread.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/internal_thread.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp > CMakeFiles/caffe.dir/internal_thread.cpp.i + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/internal_thread.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp -o CMakeFiles/caffe.dir/internal_thread.cpp.s + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o + +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/data_transformer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp + +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/data_transformer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp > CMakeFiles/caffe.dir/data_transformer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/data_transformer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp -o CMakeFiles/caffe.dir/data_transformer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires + 
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/net.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/net.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp + +src/caffe/CMakeFiles/caffe.dir/net.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/net.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp > CMakeFiles/caffe.dir/net.cpp.i + +src/caffe/CMakeFiles/caffe.dir/net.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/net.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp -o CMakeFiles/caffe.dir/net.cpp.s + +src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires + 
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/net.cpp.o + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/solver.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_60) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/solver.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/solver.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp > CMakeFiles/caffe.dir/solver.cpp.i + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/solver.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp -o CMakeFiles/caffe.dir/solver.cpp.s + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/layer_factory.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_61) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp + +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layer_factory.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp > CMakeFiles/caffe.dir/layer_factory.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layer_factory.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp -o CMakeFiles/caffe.dir/layer_factory.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires + 
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/syncedmem.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_62) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/syncedmem.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp > CMakeFiles/caffe.dir/syncedmem.cpp.i + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/syncedmem.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp -o CMakeFiles/caffe.dir/syncedmem.cpp.s + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires: +.PHONY : 
src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/layers/deconv_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_63) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp > CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/layers/infogain_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_64) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp > CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i + 
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/layers/log_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_65) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/log_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source 
to CMakeFiles/caffe.dir/layers/log_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp > CMakeFiles/caffe.dir/layers/log_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/log_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp -o CMakeFiles/caffe.dir/layers/log_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/layers/base_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_66) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp > CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/layers/euclidean_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_67) + 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp > CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/layers/image_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_68) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp > CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/layers/sigmoid_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_69) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp -o 
CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/layers/cudnn_softmax_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_70) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E 
cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/layers/cudnn_tanh_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_71) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to 
CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/layers/spp_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_72) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/spp_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp > CMakeFiles/caffe.dir/layers/spp_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/spp_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/layers/hdf5_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_73) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make 
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/layers/exp_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_74) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/exp_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp > CMakeFiles/caffe.dir/layers/exp_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/exp_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides + 
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/layers/power_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_75) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/power_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp > CMakeFiles/caffe.dir/layers/power_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/power_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp -o CMakeFiles/caffe.dir/layers/power_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires + 
$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/layers/relu_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_76) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/relu_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp > CMakeFiles/caffe.dir/layers/relu_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/relu_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires: +.PHONY : 
src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/layers/split_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_77) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/split_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp > CMakeFiles/caffe.dir/layers/split_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/split_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp -o CMakeFiles/caffe.dir/layers/split_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/layers/window_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_78) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp > CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i + 
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/layers/dropout_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_79) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX 
source to CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp > CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/layers/cudnn_sigmoid_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_80) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/layers/silence_layer.cpp + $(CMAKE_COMMAND) 
-E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_81) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/silence_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp > CMakeFiles/caffe.dir/layers/silence_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/silence_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/layers/cudnn_pooling_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_82) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/layers/lrn_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_83) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp > CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp -o 
CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/layers/memory_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_84) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp > CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX 
source to assembly CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/layers/mvn_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_85) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp > CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/layers/cudnn_relu_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_86) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/layers/slice_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_87) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/slice_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp > CMakeFiles/caffe.dir/layers/slice_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/slice_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/layers/pooling_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 
$(CMAKE_PROGRESS_88) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp > CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o + 
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/layers/hdf5_output_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_89) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires + $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/layers/inner_product_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_90) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp > CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp -o 
CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/layers/threshold_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_91) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp > CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s: cmake_force + 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/layers/reduction_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_92) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp > CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/layers/tanh_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_93) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp > CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/layers/prelu_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_94) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp > CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: 
src/caffe/layers/accuracy_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_95) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp > CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides + 
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/layers/neuron_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_96) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp > CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/layers/absval_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_97) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/absval_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp > CMakeFiles/caffe.dir/layers/absval_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/absval_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp -o 
CMakeFiles/caffe.dir/layers/absval_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/layers/loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_98) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp > CMakeFiles/caffe.dir/layers/loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/caffe.dir/layers/loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/layers/softmax_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_99) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp > 
CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/layers/cudnn_conv_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_100) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) 
--green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_101) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" + 
cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides + 
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/layers/concat_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_102) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/concat_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp > CMakeFiles/caffe.dir/layers/concat_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/concat_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/layers/hinge_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_103) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp > CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/layers/bnll_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_104) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp > CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s: cmake_force + 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/layers/flatten_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_105) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp > CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/layers/argmax_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_106) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i: 
cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp > CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/layers/filter_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_107) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/filter_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp > CMakeFiles/caffe.dir/layers/filter_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/filter_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/layers/dummy_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_108) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp > CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/layers/conv_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_109) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/conv_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp > CMakeFiles/caffe.dir/layers/conv_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/conv_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/layers/base_conv_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_110) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp > CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires: 
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/layers/data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_111) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp > CMakeFiles/caffe.dir/layers/data_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp -o CMakeFiles/caffe.dir/layers/data_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/layers/softmax_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_112) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i + 
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/layers/eltwise_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_113) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Preprocessing CXX source to CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp > CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/layers/im2col_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_114) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp > CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/layers/multinomial_logistic_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_115) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp > CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/layers/contrastive_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_116) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp > CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/layers/reshape_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_117) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp > 
CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires: +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires + $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build +.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides + +src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o + +# Object files for target caffe +caffe_OBJECTS = \ +"CMakeFiles/caffe.dir/common.cpp.o" \ +"CMakeFiles/caffe.dir/blob.cpp.o" \ +"CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" \ +"CMakeFiles/caffe.dir/util/im2col.cpp.o" \ +"CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" \ +"CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" \ +"CMakeFiles/caffe.dir/util/ocl_util.cpp.o" \ +"CMakeFiles/caffe.dir/util/insert_splits.cpp.o" \ +"CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" \ +"CMakeFiles/caffe.dir/util/math_functions.cpp.o" \ +"CMakeFiles/caffe.dir/util/io.cpp.o" \ +"CMakeFiles/caffe.dir/util/cudnn.cpp.o" \ +"CMakeFiles/caffe.dir/util/db.cpp.o" \ +"CMakeFiles/caffe.dir/util/benchmark.cpp.o" \ +"CMakeFiles/caffe.dir/device.cpp.o" \ +"CMakeFiles/caffe.dir/internal_thread.cpp.o" \ +"CMakeFiles/caffe.dir/data_transformer.cpp.o" \ +"CMakeFiles/caffe.dir/net.cpp.o" \ 
+"CMakeFiles/caffe.dir/solver.cpp.o" \ +"CMakeFiles/caffe.dir/layer_factory.cpp.o" \ +"CMakeFiles/caffe.dir/syncedmem.cpp.o" \ +"CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/log_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/power_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/split_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" \ 
+"CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/data_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" \ +"CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" + +# External object files for target caffe +caffe_EXTERNAL_OBJECTS = \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" \ 
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" \ 
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" \ 
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" + +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/common.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o +lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/device.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/net.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o 
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o +lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o 
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o 
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o +lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o +lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/build.make +lib/libcaffe.so: lib/libproto.a +lib/libcaffe.so: lib/libproto.a +lib/libcaffe.so: /usr/local/lib/libboost_system.so +lib/libcaffe.so: /usr/local/lib/libboost_thread.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libpthread.so +lib/libcaffe.so: /usr/local/lib/libglog.so +lib/libcaffe.so: /usr/local/lib/libgflags.a +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so +lib/libcaffe.so: /usr/local/lib/liblmdb.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so +lib/libcaffe.so: /usr/lib/libsnappy.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so +lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10 +lib/libcaffe.so: /usr/local/lib/libopencv_highgui.so.2.4.10 +lib/libcaffe.so: /usr/local/lib/libopencv_imgproc.so.2.4.10 +lib/libcaffe.so: /usr/lib/liblapack_atlas.so +lib/libcaffe.so: /usr/lib/libcblas.so +lib/libcaffe.so: /usr/lib/libatlas.so +lib/libcaffe.so: /usr/local/lib/libglog.so +lib/libcaffe.so: /usr/local/lib/libgflags.a +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so +lib/libcaffe.so: /usr/local/lib/liblmdb.so +lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so +lib/libcaffe.so: /usr/lib/libsnappy.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so +lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so +lib/libcaffe.so: /usr/lib/liblapack_atlas.so +lib/libcaffe.so: /usr/lib/libcblas.so +lib/libcaffe.so: /usr/lib/libatlas.so +lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10 +lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/link.txt + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) 
--red --bold "Linking CXX shared library ../../lib/libcaffe.so" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/caffe.dir/link.txt --verbose=$(VERBOSE) + +# Rule to build all files generated by this target. +src/caffe/CMakeFiles/caffe.dir/build: lib/libcaffe.so +.PHONY : src/caffe/CMakeFiles/caffe.dir/build + +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires 
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires +src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires +.PHONY : src/caffe/CMakeFiles/caffe.dir/requires + +src/caffe/CMakeFiles/caffe.dir/clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/caffe.dir/cmake_clean.cmake +.PHONY : src/caffe/CMakeFiles/caffe.dir/clean + +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o 
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o +src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake --color=$(COLOR) +.PHONY : src/caffe/CMakeFiles/caffe.dir/depend + diff --git a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake new file mode 100644 index 00000000..344db002 --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake @@ -0,0 +1,126 @@ +FILE(REMOVE_RECURSE + "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" + "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" + 
"CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" + 
"CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" + "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" + "CMakeFiles/caffe.dir/common.cpp.o" + "CMakeFiles/caffe.dir/blob.cpp.o" + "CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" + "CMakeFiles/caffe.dir/util/im2col.cpp.o" + "CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" + "CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" + "CMakeFiles/caffe.dir/util/ocl_util.cpp.o" + "CMakeFiles/caffe.dir/util/insert_splits.cpp.o" + "CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" + "CMakeFiles/caffe.dir/util/math_functions.cpp.o" + "CMakeFiles/caffe.dir/util/io.cpp.o" + "CMakeFiles/caffe.dir/util/cudnn.cpp.o" + "CMakeFiles/caffe.dir/util/db.cpp.o" + "CMakeFiles/caffe.dir/util/benchmark.cpp.o" + "CMakeFiles/caffe.dir/device.cpp.o" + "CMakeFiles/caffe.dir/internal_thread.cpp.o" + 
"CMakeFiles/caffe.dir/data_transformer.cpp.o" + "CMakeFiles/caffe.dir/net.cpp.o" + "CMakeFiles/caffe.dir/solver.cpp.o" + "CMakeFiles/caffe.dir/layer_factory.cpp.o" + "CMakeFiles/caffe.dir/syncedmem.cpp.o" + "CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/log_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/power_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/split_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" + 
"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/data_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" + "CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" + "../../lib/libcaffe.pdb" + "../../lib/libcaffe.so" +) + +# Per-language clean rules from dependency scanning. +FOREACH(lang CXX) + INCLUDE(CMakeFiles/caffe.dir/cmake_clean_${lang}.cmake OPTIONAL) +ENDFOREACH(lang) diff --git a/src/caffe/CMakeFiles/caffe.dir/depend.make b/src/caffe/CMakeFiles/caffe.dir/depend.make new file mode 100644 index 00000000..0b20d16b --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/depend.make @@ -0,0 +1,2 @@ +# Empty dependencies file for caffe. +# This may be replaced when dependencies are built. diff --git a/src/caffe/CMakeFiles/caffe.dir/flags.make b/src/caffe/CMakeFiles/caffe.dir/flags.make new file mode 100644 index 00000000..494d36e8 --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/flags.make @@ -0,0 +1,8 @@ +# CMAKE generated file: DO NOT EDIT! 
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# compile CXX with /usr/bin/c++ +CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -fPIC -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe + +CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE -Dcaffe_EXPORTS + diff --git a/src/caffe/CMakeFiles/caffe.dir/link.txt b/src/caffe/CMakeFiles/caffe.dir/link.txt new file mode 100644 index 00000000..603d461f --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/link.txt @@ -0,0 +1 @@ +/usr/bin/c++ -fPIC -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -shared -Wl,-soname,libcaffe.so -o ../../lib/libcaffe.so CMakeFiles/caffe.dir/common.cpp.o CMakeFiles/caffe.dir/blob.cpp.o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o CMakeFiles/caffe.dir/util/im2col.cpp.o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o CMakeFiles/caffe.dir/util/ocl_util.cpp.o CMakeFiles/caffe.dir/util/insert_splits.cpp.o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o CMakeFiles/caffe.dir/util/math_functions.cpp.o CMakeFiles/caffe.dir/util/io.cpp.o CMakeFiles/caffe.dir/util/cudnn.cpp.o CMakeFiles/caffe.dir/util/db.cpp.o CMakeFiles/caffe.dir/util/benchmark.cpp.o CMakeFiles/caffe.dir/device.cpp.o CMakeFiles/caffe.dir/internal_thread.cpp.o CMakeFiles/caffe.dir/data_transformer.cpp.o CMakeFiles/caffe.dir/net.cpp.o CMakeFiles/caffe.dir/solver.cpp.o CMakeFiles/caffe.dir/layer_factory.cpp.o CMakeFiles/caffe.dir/syncedmem.cpp.o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/log_layer.cpp.o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o 
CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o CMakeFiles/caffe.dir/layers/power_layer.cpp.o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o CMakeFiles/caffe.dir/layers/split_layer.cpp.o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o 
CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/data_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o 
CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o 
CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -L/usr/local/cuda/lib64 -L/usr/local/lib ../../lib/libproto.a ../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_core.so.2.4.10 /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 -llapack_atlas -lcblas -latlas /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so -llapack_atlas -lcblas -latlas /usr/local/lib/libopencv_core.so.2.4.10 -Wl,-rpath,/usr/local/cuda/lib64:/usr/local/lib::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/src/caffe/CMakeFiles/caffe.dir/progress.make b/src/caffe/CMakeFiles/caffe.dir/progress.make new file mode 100644 index 00000000..d53ba6a8 --- /dev/null +++ b/src/caffe/CMakeFiles/caffe.dir/progress.make @@ -0,0 +1,118 @@ +CMAKE_PROGRESS_1 = +CMAKE_PROGRESS_2 = 1 +CMAKE_PROGRESS_3 = +CMAKE_PROGRESS_4 = 2 +CMAKE_PROGRESS_5 = +CMAKE_PROGRESS_6 = 3 +CMAKE_PROGRESS_7 = +CMAKE_PROGRESS_8 = 4 +CMAKE_PROGRESS_9 = +CMAKE_PROGRESS_10 = 5 +CMAKE_PROGRESS_11 = +CMAKE_PROGRESS_12 = 6 +CMAKE_PROGRESS_13 = +CMAKE_PROGRESS_14 = 7 +CMAKE_PROGRESS_15 = +CMAKE_PROGRESS_16 = 8 +CMAKE_PROGRESS_17 = +CMAKE_PROGRESS_18 = 9 +CMAKE_PROGRESS_19 = +CMAKE_PROGRESS_20 = 10 +CMAKE_PROGRESS_21 = +CMAKE_PROGRESS_22 = 11 +CMAKE_PROGRESS_23 = +CMAKE_PROGRESS_24 = 12 +CMAKE_PROGRESS_25 = +CMAKE_PROGRESS_26 = 13 +CMAKE_PROGRESS_27 = +CMAKE_PROGRESS_28 = 14 +CMAKE_PROGRESS_29 = 
+CMAKE_PROGRESS_30 = 15 +CMAKE_PROGRESS_31 = +CMAKE_PROGRESS_32 = 16 +CMAKE_PROGRESS_33 = +CMAKE_PROGRESS_34 = 17 +CMAKE_PROGRESS_35 = +CMAKE_PROGRESS_36 = 18 +CMAKE_PROGRESS_37 = +CMAKE_PROGRESS_38 = 19 +CMAKE_PROGRESS_39 = +CMAKE_PROGRESS_40 = 20 +CMAKE_PROGRESS_41 = +CMAKE_PROGRESS_42 = 21 +CMAKE_PROGRESS_43 = +CMAKE_PROGRESS_44 = 22 +CMAKE_PROGRESS_45 = +CMAKE_PROGRESS_46 = 23 +CMAKE_PROGRESS_47 = +CMAKE_PROGRESS_48 = 24 +CMAKE_PROGRESS_49 = +CMAKE_PROGRESS_50 = 25 +CMAKE_PROGRESS_51 = +CMAKE_PROGRESS_52 = 26 +CMAKE_PROGRESS_53 = +CMAKE_PROGRESS_54 = 27 +CMAKE_PROGRESS_55 = +CMAKE_PROGRESS_56 = 28 +CMAKE_PROGRESS_57 = +CMAKE_PROGRESS_58 = 29 +CMAKE_PROGRESS_59 = +CMAKE_PROGRESS_60 = 30 +CMAKE_PROGRESS_61 = +CMAKE_PROGRESS_62 = 31 +CMAKE_PROGRESS_63 = +CMAKE_PROGRESS_64 = 32 +CMAKE_PROGRESS_65 = +CMAKE_PROGRESS_66 = 33 +CMAKE_PROGRESS_67 = 34 +CMAKE_PROGRESS_68 = +CMAKE_PROGRESS_69 = 35 +CMAKE_PROGRESS_70 = +CMAKE_PROGRESS_71 = 36 +CMAKE_PROGRESS_72 = +CMAKE_PROGRESS_73 = 37 +CMAKE_PROGRESS_74 = +CMAKE_PROGRESS_75 = 38 +CMAKE_PROGRESS_76 = +CMAKE_PROGRESS_77 = 39 +CMAKE_PROGRESS_78 = +CMAKE_PROGRESS_79 = 40 +CMAKE_PROGRESS_80 = +CMAKE_PROGRESS_81 = 41 +CMAKE_PROGRESS_82 = +CMAKE_PROGRESS_83 = 42 +CMAKE_PROGRESS_84 = +CMAKE_PROGRESS_85 = 43 +CMAKE_PROGRESS_86 = +CMAKE_PROGRESS_87 = 44 +CMAKE_PROGRESS_88 = +CMAKE_PROGRESS_89 = 45 +CMAKE_PROGRESS_90 = +CMAKE_PROGRESS_91 = 46 +CMAKE_PROGRESS_92 = +CMAKE_PROGRESS_93 = 47 +CMAKE_PROGRESS_94 = +CMAKE_PROGRESS_95 = 48 +CMAKE_PROGRESS_96 = +CMAKE_PROGRESS_97 = 49 +CMAKE_PROGRESS_98 = +CMAKE_PROGRESS_99 = 50 +CMAKE_PROGRESS_100 = +CMAKE_PROGRESS_101 = 51 +CMAKE_PROGRESS_102 = +CMAKE_PROGRESS_103 = 52 +CMAKE_PROGRESS_104 = +CMAKE_PROGRESS_105 = 53 +CMAKE_PROGRESS_106 = +CMAKE_PROGRESS_107 = 54 +CMAKE_PROGRESS_108 = +CMAKE_PROGRESS_109 = 55 +CMAKE_PROGRESS_110 = +CMAKE_PROGRESS_111 = 56 +CMAKE_PROGRESS_112 = +CMAKE_PROGRESS_113 = 57 +CMAKE_PROGRESS_114 = +CMAKE_PROGRESS_115 = 58 +CMAKE_PROGRESS_116 = +CMAKE_PROGRESS_117 = 
59 + diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake new file mode 100644 index 00000000..2b3197e9 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/absval_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in 
+# order to force this file to be run again if it changes. +set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake new file mode 100644 index 00000000..5558d70f --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/base_data_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake new file mode 100644 index 00000000..ae71cc72 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/bnll_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake new file mode 100644 index 00000000..48e8560a --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/concat_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake new file mode 100644 index 00000000..c5f6dca9 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/contrastive_loss_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake new file mode 100644 index 00000000..311ad242 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/conv_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake new file mode 100644 index 00000000..06210cf1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_conv_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake new file mode 100644 index 00000000..8f7960d4 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_pooling_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake new file mode 100644 index 00000000..308889ee --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_relu_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake new file mode 100644 index 00000000..d65ebd00 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake new file mode 100644 index 00000000..806067ce --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_softmax_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake new file mode 100644 index 00000000..7ace65eb --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_tanh_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake new file mode 100644 index 00000000..bc67ea5b --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/deconv_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake new file mode 100644 index 00000000..5ff06e9f --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/dropout_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake new file mode 100644 index 00000000..44e91898 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/eltwise_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake new file mode 100644 index 00000000..98ee3de7 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/euclidean_loss_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake new file mode 100644 index 00000000..2402999e --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/exp_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake new file mode 100644 index 00000000..83a032df --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/filter_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake new file mode 100644 index 00000000..a88ed54d --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_data_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake new file mode 100644 index 00000000..252b9dfd --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_output_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake new file mode 100644 index 00000000..6bda58ec --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/im2col_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake new file mode 100644 index 00000000..eac6680c --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/inner_product_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake new file mode 100644 index 00000000..d18371a0 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/log_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake new file mode 100644 index 00000000..c3c715f8 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/lrn_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake new file mode 100644 index 00000000..663f4478 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/mvn_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake new file mode 100644 index 00000000..866d0f93 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/pooling_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake new file mode 100644 index 00000000..c6c30190 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/power_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake new file mode 100644 index 00000000..c64cff0e --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/prelu_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake new file mode 100644 index 00000000..b926deab --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/reduction_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake new file mode 100644 index 00000000..27fda108 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/relu_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake new file mode 100644 index 00000000..63d7ac68 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend new file mode 100644 index 00000000..a7e2268a --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend @@ -0,0 +1,470 @@ +# Generated by: make2cmake.cmake +SET(CUDA_NVCC_DEPEND + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp" + 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu" + "/opt/clBLAS-private-april8/include/clBLAS-complex.h" + "/opt/clBLAS-private-april8/include/clBLAS.h" + "/usr/include/H5ACpublic.h" + "/usr/include/H5Apublic.h" + "/usr/include/H5Cpublic.h" + "/usr/include/H5Dpublic.h" + "/usr/include/H5Epubgen.h" + "/usr/include/H5Epublic.h" + "/usr/include/H5FDcore.h" + "/usr/include/H5FDdirect.h" + "/usr/include/H5FDfamily.h" + "/usr/include/H5FDlog.h" + "/usr/include/H5FDmpi.h" + "/usr/include/H5FDmpio.h" + "/usr/include/H5FDmpiposix.h" + "/usr/include/H5FDmulti.h" + "/usr/include/H5FDpublic.h" + "/usr/include/H5FDsec2.h" + "/usr/include/H5FDstdio.h" + "/usr/include/H5Fpublic.h" + "/usr/include/H5Gpublic.h" + "/usr/include/H5Ipublic.h" + "/usr/include/H5Lpublic.h" + "/usr/include/H5MMpublic.h" + "/usr/include/H5Opublic.h" + "/usr/include/H5Ppublic.h" + "/usr/include/H5Rpublic.h" + "/usr/include/H5Spublic.h" + "/usr/include/H5Tpublic.h" + "/usr/include/H5Zpublic.h" + "/usr/include/H5api_adpt.h" + "/usr/include/H5pubconf.h" + "/usr/include/H5public.h" + "/usr/include/H5version.h" + "/usr/include/_G_config.h" + 
"/usr/include/alloca.h" + "/usr/include/asm-generic/errno-base.h" + "/usr/include/asm-generic/errno.h" + "/usr/include/assert.h" + "/usr/include/atlas/cblas.h" + "/usr/include/c++/4.8/algorithm" + "/usr/include/c++/4.8/backward/auto_ptr.h" + "/usr/include/c++/4.8/backward/binders.h" + "/usr/include/c++/4.8/bits/algorithmfwd.h" + "/usr/include/c++/4.8/bits/allocator.h" + "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" + "/usr/include/c++/4.8/bits/basic_ios.h" + "/usr/include/c++/4.8/bits/basic_ios.tcc" + "/usr/include/c++/4.8/bits/basic_string.h" + "/usr/include/c++/4.8/bits/basic_string.tcc" + "/usr/include/c++/4.8/bits/char_traits.h" + "/usr/include/c++/4.8/bits/codecvt.h" + "/usr/include/c++/4.8/bits/concept_check.h" + "/usr/include/c++/4.8/bits/cpp_type_traits.h" + "/usr/include/c++/4.8/bits/cxxabi_forced.h" + "/usr/include/c++/4.8/bits/exception_defines.h" + "/usr/include/c++/4.8/bits/fstream.tcc" + "/usr/include/c++/4.8/bits/functexcept.h" + "/usr/include/c++/4.8/bits/ios_base.h" + "/usr/include/c++/4.8/bits/istream.tcc" + "/usr/include/c++/4.8/bits/locale_classes.h" + "/usr/include/c++/4.8/bits/locale_classes.tcc" + "/usr/include/c++/4.8/bits/locale_facets.h" + "/usr/include/c++/4.8/bits/locale_facets.tcc" + "/usr/include/c++/4.8/bits/localefwd.h" + "/usr/include/c++/4.8/bits/memoryfwd.h" + "/usr/include/c++/4.8/bits/move.h" + "/usr/include/c++/4.8/bits/ostream.tcc" + "/usr/include/c++/4.8/bits/ostream_insert.h" + "/usr/include/c++/4.8/bits/postypes.h" + "/usr/include/c++/4.8/bits/range_access.h" + "/usr/include/c++/4.8/bits/sstream.tcc" + "/usr/include/c++/4.8/bits/stl_algo.h" + "/usr/include/c++/4.8/bits/stl_algobase.h" + "/usr/include/c++/4.8/bits/stl_bvector.h" + "/usr/include/c++/4.8/bits/stl_construct.h" + "/usr/include/c++/4.8/bits/stl_function.h" + "/usr/include/c++/4.8/bits/stl_heap.h" + "/usr/include/c++/4.8/bits/stl_iterator.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" + 
"/usr/include/c++/4.8/bits/stl_map.h" + "/usr/include/c++/4.8/bits/stl_multimap.h" + "/usr/include/c++/4.8/bits/stl_multiset.h" + "/usr/include/c++/4.8/bits/stl_pair.h" + "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" + "/usr/include/c++/4.8/bits/stl_relops.h" + "/usr/include/c++/4.8/bits/stl_set.h" + "/usr/include/c++/4.8/bits/stl_tempbuf.h" + "/usr/include/c++/4.8/bits/stl_tree.h" + "/usr/include/c++/4.8/bits/stl_uninitialized.h" + "/usr/include/c++/4.8/bits/stl_vector.h" + "/usr/include/c++/4.8/bits/stream_iterator.h" + "/usr/include/c++/4.8/bits/streambuf.tcc" + "/usr/include/c++/4.8/bits/streambuf_iterator.h" + "/usr/include/c++/4.8/bits/stringfwd.h" + "/usr/include/c++/4.8/bits/vector.tcc" + "/usr/include/c++/4.8/cctype" + "/usr/include/c++/4.8/cfloat" + "/usr/include/c++/4.8/climits" + "/usr/include/c++/4.8/clocale" + "/usr/include/c++/4.8/cmath" + "/usr/include/c++/4.8/cstddef" + "/usr/include/c++/4.8/cstdio" + "/usr/include/c++/4.8/cstdlib" + "/usr/include/c++/4.8/cwchar" + "/usr/include/c++/4.8/cwctype" + "/usr/include/c++/4.8/cxxabi.h" + "/usr/include/c++/4.8/debug/debug.h" + "/usr/include/c++/4.8/exception" + "/usr/include/c++/4.8/ext/alloc_traits.h" + "/usr/include/c++/4.8/ext/atomicity.h" + "/usr/include/c++/4.8/ext/new_allocator.h" + "/usr/include/c++/4.8/ext/numeric_traits.h" + "/usr/include/c++/4.8/ext/type_traits.h" + "/usr/include/c++/4.8/fstream" + "/usr/include/c++/4.8/functional" + "/usr/include/c++/4.8/ios" + "/usr/include/c++/4.8/iosfwd" + "/usr/include/c++/4.8/iostream" + "/usr/include/c++/4.8/istream" + "/usr/include/c++/4.8/iterator" + "/usr/include/c++/4.8/map" + "/usr/include/c++/4.8/memory" + "/usr/include/c++/4.8/new" + "/usr/include/c++/4.8/ostream" + "/usr/include/c++/4.8/set" + "/usr/include/c++/4.8/sstream" + "/usr/include/c++/4.8/streambuf" + "/usr/include/c++/4.8/string" + "/usr/include/c++/4.8/typeinfo" + "/usr/include/c++/4.8/utility" + "/usr/include/c++/4.8/vector" + "/usr/include/ctype.h" + "/usr/include/endian.h" + 
"/usr/include/errno.h" + "/usr/include/features.h" + "/usr/include/getopt.h" + "/usr/include/google/protobuf/descriptor.h" + "/usr/include/google/protobuf/extension_set.h" + "/usr/include/google/protobuf/generated_enum_reflection.h" + "/usr/include/google/protobuf/generated_message_util.h" + "/usr/include/google/protobuf/message.h" + "/usr/include/google/protobuf/message_lite.h" + "/usr/include/google/protobuf/repeated_field.h" + "/usr/include/google/protobuf/stubs/common.h" + "/usr/include/google/protobuf/stubs/template_util.h" + "/usr/include/google/protobuf/stubs/type_traits.h" + "/usr/include/google/protobuf/unknown_field_set.h" + "/usr/include/hdf5.h" + "/usr/include/inttypes.h" + "/usr/include/libio.h" + "/usr/include/limits.h" + "/usr/include/linux/errno.h" + "/usr/include/linux/limits.h" + "/usr/include/locale.h" + "/usr/include/math.h" + "/usr/include/pthread.h" + "/usr/include/sched.h" + "/usr/include/stdc-predef.h" + "/usr/include/stdint.h" + "/usr/include/stdio.h" + "/usr/include/stdlib.h" + "/usr/include/string.h" + "/usr/include/time.h" + "/usr/include/unistd.h" + "/usr/include/wchar.h" + "/usr/include/wctype.h" + "/usr/include/x86_64-linux-gnu/asm/errno.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap.h" + "/usr/include/x86_64-linux-gnu/bits/confname.h" + "/usr/include/x86_64-linux-gnu/bits/endian.h" + "/usr/include/x86_64-linux-gnu/bits/environments.h" + "/usr/include/x86_64-linux-gnu/bits/errno.h" + "/usr/include/x86_64-linux-gnu/bits/huge_val.h" + "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" + "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" + "/usr/include/x86_64-linux-gnu/bits/inf.h" + "/usr/include/x86_64-linux-gnu/bits/local_lim.h" + "/usr/include/x86_64-linux-gnu/bits/locale.h" + "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" + "/usr/include/x86_64-linux-gnu/bits/mathdef.h" + "/usr/include/x86_64-linux-gnu/bits/mathinline.h" + "/usr/include/x86_64-linux-gnu/bits/nan.h" + 
"/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" + "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" + "/usr/include/x86_64-linux-gnu/bits/sched.h" + "/usr/include/x86_64-linux-gnu/bits/select.h" + "/usr/include/x86_64-linux-gnu/bits/select2.h" + "/usr/include/x86_64-linux-gnu/bits/setjmp.h" + "/usr/include/x86_64-linux-gnu/bits/sigset.h" + "/usr/include/x86_64-linux-gnu/bits/stdio.h" + "/usr/include/x86_64-linux-gnu/bits/stdio2.h" + "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib.h" + "/usr/include/x86_64-linux-gnu/bits/string3.h" + "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" + "/usr/include/x86_64-linux-gnu/bits/time.h" + "/usr/include/x86_64-linux-gnu/bits/timex.h" + "/usr/include/x86_64-linux-gnu/bits/types.h" + "/usr/include/x86_64-linux-gnu/bits/typesizes.h" + "/usr/include/x86_64-linux-gnu/bits/unistd.h" + "/usr/include/x86_64-linux-gnu/bits/waitflags.h" + "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" + "/usr/include/x86_64-linux-gnu/bits/wchar.h" + "/usr/include/x86_64-linux-gnu/bits/wchar2.h" + "/usr/include/x86_64-linux-gnu/bits/wordsize.h" + "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" + 
"/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs.h" + "/usr/include/x86_64-linux-gnu/sys/cdefs.h" + "/usr/include/x86_64-linux-gnu/sys/select.h" + "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" + "/usr/include/x86_64-linux-gnu/sys/types.h" + "/usr/include/xlocale.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/float.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" + "/usr/local/cuda-6.5/include/CL/cl.h" + "/usr/local/cuda-6.5/include/CL/cl_ext.h" + "/usr/local/cuda-6.5/include/CL/cl_platform.h" + "/usr/local/cuda-6.5/include/builtin_types.h" + "/usr/local/cuda-6.5/include/channel_descriptor.h" + "/usr/local/cuda-6.5/include/common_functions.h" + "/usr/local/cuda-6.5/include/cuComplex.h" + "/usr/local/cuda-6.5/include/cublas_api.h" + "/usr/local/cuda-6.5/include/cublas_v2.h" + "/usr/local/cuda-6.5/include/cuda.h" + "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_runtime.h" + "/usr/local/cuda-6.5/include/cuda_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_surface_types.h" + "/usr/local/cuda-6.5/include/cuda_texture_types.h" + "/usr/local/cuda-6.5/include/curand.h" + "/usr/local/cuda-6.5/include/device_functions.h" + "/usr/local/cuda-6.5/include/device_launch_parameters.h" + "/usr/local/cuda-6.5/include/device_types.h" + 
"/usr/local/cuda-6.5/include/driver_functions.h" + "/usr/local/cuda-6.5/include/driver_types.h" + "/usr/local/cuda-6.5/include/host_config.h" + "/usr/local/cuda-6.5/include/host_defines.h" + "/usr/local/cuda-6.5/include/math_functions.h" + "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" + "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_13_double_functions.h" + "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" + "/usr/local/cuda-6.5/include/surface_functions.h" + "/usr/local/cuda-6.5/include/surface_indirect_functions.h" + "/usr/local/cuda-6.5/include/surface_types.h" + "/usr/local/cuda-6.5/include/texture_fetch_functions.h" + "/usr/local/cuda-6.5/include/texture_indirect_functions.h" + "/usr/local/cuda-6.5/include/texture_types.h" + "/usr/local/cuda-6.5/include/vector_functions.h" + "/usr/local/cuda-6.5/include/vector_types.h" + "/usr/local/include/boost/assert.hpp" + "/usr/local/include/boost/checked_delete.hpp" + "/usr/local/include/boost/config.hpp" + "/usr/local/include/boost/config/compiler/gcc.hpp" + "/usr/local/include/boost/config/compiler/nvcc.hpp" + "/usr/local/include/boost/config/no_tr1/memory.hpp" + "/usr/local/include/boost/config/no_tr1/utility.hpp" + "/usr/local/include/boost/config/platform/linux.hpp" + "/usr/local/include/boost/config/posix_features.hpp" + "/usr/local/include/boost/config/select_compiler_config.hpp" + "/usr/local/include/boost/config/select_platform_config.hpp" + "/usr/local/include/boost/config/select_stdlib_config.hpp" + "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" + 
"/usr/local/include/boost/config/suffix.hpp" + "/usr/local/include/boost/config/user.hpp" + "/usr/local/include/boost/core/checked_delete.hpp" + "/usr/local/include/boost/core/demangle.hpp" + "/usr/local/include/boost/core/typeinfo.hpp" + "/usr/local/include/boost/current_function.hpp" + "/usr/local/include/boost/detail/sp_typeinfo.hpp" + "/usr/local/include/boost/detail/workaround.hpp" + "/usr/local/include/boost/exception/exception.hpp" + "/usr/local/include/boost/predef.h" + "/usr/local/include/boost/predef/architecture.h" + "/usr/local/include/boost/predef/architecture/alpha.h" + "/usr/local/include/boost/predef/architecture/arm.h" + "/usr/local/include/boost/predef/architecture/blackfin.h" + "/usr/local/include/boost/predef/architecture/convex.h" + "/usr/local/include/boost/predef/architecture/ia64.h" + "/usr/local/include/boost/predef/architecture/m68k.h" + "/usr/local/include/boost/predef/architecture/mips.h" + "/usr/local/include/boost/predef/architecture/parisc.h" + "/usr/local/include/boost/predef/architecture/ppc.h" + "/usr/local/include/boost/predef/architecture/pyramid.h" + "/usr/local/include/boost/predef/architecture/rs6k.h" + "/usr/local/include/boost/predef/architecture/sparc.h" + "/usr/local/include/boost/predef/architecture/superh.h" + "/usr/local/include/boost/predef/architecture/sys370.h" + "/usr/local/include/boost/predef/architecture/sys390.h" + "/usr/local/include/boost/predef/architecture/x86.h" + "/usr/local/include/boost/predef/architecture/x86/32.h" + "/usr/local/include/boost/predef/architecture/x86/64.h" + "/usr/local/include/boost/predef/architecture/z.h" + "/usr/local/include/boost/predef/compiler.h" + "/usr/local/include/boost/predef/compiler/borland.h" + "/usr/local/include/boost/predef/compiler/clang.h" + "/usr/local/include/boost/predef/compiler/comeau.h" + "/usr/local/include/boost/predef/compiler/compaq.h" + "/usr/local/include/boost/predef/compiler/diab.h" + "/usr/local/include/boost/predef/compiler/digitalmars.h" + 
"/usr/local/include/boost/predef/compiler/dignus.h" + "/usr/local/include/boost/predef/compiler/edg.h" + "/usr/local/include/boost/predef/compiler/ekopath.h" + "/usr/local/include/boost/predef/compiler/gcc.h" + "/usr/local/include/boost/predef/compiler/gcc_xml.h" + "/usr/local/include/boost/predef/compiler/greenhills.h" + "/usr/local/include/boost/predef/compiler/hp_acc.h" + "/usr/local/include/boost/predef/compiler/iar.h" + "/usr/local/include/boost/predef/compiler/ibm.h" + "/usr/local/include/boost/predef/compiler/intel.h" + "/usr/local/include/boost/predef/compiler/kai.h" + "/usr/local/include/boost/predef/compiler/llvm.h" + "/usr/local/include/boost/predef/compiler/metaware.h" + "/usr/local/include/boost/predef/compiler/metrowerks.h" + "/usr/local/include/boost/predef/compiler/microtec.h" + "/usr/local/include/boost/predef/compiler/mpw.h" + "/usr/local/include/boost/predef/compiler/palm.h" + "/usr/local/include/boost/predef/compiler/pgi.h" + "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" + "/usr/local/include/boost/predef/compiler/sunpro.h" + "/usr/local/include/boost/predef/compiler/tendra.h" + "/usr/local/include/boost/predef/compiler/visualc.h" + "/usr/local/include/boost/predef/compiler/watcom.h" + "/usr/local/include/boost/predef/detail/_cassert.h" + "/usr/local/include/boost/predef/detail/_exception.h" + "/usr/local/include/boost/predef/detail/comp_detected.h" + "/usr/local/include/boost/predef/detail/os_detected.h" + "/usr/local/include/boost/predef/detail/test.h" + "/usr/local/include/boost/predef/language.h" + "/usr/local/include/boost/predef/language/objc.h" + "/usr/local/include/boost/predef/language/stdc.h" + "/usr/local/include/boost/predef/language/stdcpp.h" + "/usr/local/include/boost/predef/library.h" + "/usr/local/include/boost/predef/library/c.h" + "/usr/local/include/boost/predef/library/c/_prefix.h" + "/usr/local/include/boost/predef/library/c/gnu.h" + "/usr/local/include/boost/predef/library/c/uc.h" + 
"/usr/local/include/boost/predef/library/c/vms.h" + "/usr/local/include/boost/predef/library/c/zos.h" + "/usr/local/include/boost/predef/library/std.h" + "/usr/local/include/boost/predef/library/std/_prefix.h" + "/usr/local/include/boost/predef/library/std/cxx.h" + "/usr/local/include/boost/predef/library/std/dinkumware.h" + "/usr/local/include/boost/predef/library/std/libcomo.h" + "/usr/local/include/boost/predef/library/std/modena.h" + "/usr/local/include/boost/predef/library/std/msl.h" + "/usr/local/include/boost/predef/library/std/roguewave.h" + "/usr/local/include/boost/predef/library/std/sgi.h" + "/usr/local/include/boost/predef/library/std/stdcpp3.h" + "/usr/local/include/boost/predef/library/std/stlport.h" + "/usr/local/include/boost/predef/library/std/vacpp.h" + "/usr/local/include/boost/predef/make.h" + "/usr/local/include/boost/predef/os.h" + "/usr/local/include/boost/predef/os/aix.h" + "/usr/local/include/boost/predef/os/amigaos.h" + "/usr/local/include/boost/predef/os/android.h" + "/usr/local/include/boost/predef/os/beos.h" + "/usr/local/include/boost/predef/os/bsd.h" + "/usr/local/include/boost/predef/os/bsd/bsdi.h" + "/usr/local/include/boost/predef/os/bsd/dragonfly.h" + "/usr/local/include/boost/predef/os/bsd/free.h" + "/usr/local/include/boost/predef/os/bsd/net.h" + "/usr/local/include/boost/predef/os/bsd/open.h" + "/usr/local/include/boost/predef/os/cygwin.h" + "/usr/local/include/boost/predef/os/hpux.h" + "/usr/local/include/boost/predef/os/ios.h" + "/usr/local/include/boost/predef/os/irix.h" + "/usr/local/include/boost/predef/os/linux.h" + "/usr/local/include/boost/predef/os/macos.h" + "/usr/local/include/boost/predef/os/os400.h" + "/usr/local/include/boost/predef/os/qnxnto.h" + "/usr/local/include/boost/predef/os/solaris.h" + "/usr/local/include/boost/predef/os/unix.h" + "/usr/local/include/boost/predef/os/vms.h" + "/usr/local/include/boost/predef/os/windows.h" + "/usr/local/include/boost/predef/other.h" + 
"/usr/local/include/boost/predef/other/endian.h" + "/usr/local/include/boost/predef/platform.h" + "/usr/local/include/boost/predef/platform/mingw.h" + "/usr/local/include/boost/predef/platform/windows_desktop.h" + "/usr/local/include/boost/predef/platform/windows_phone.h" + "/usr/local/include/boost/predef/platform/windows_runtime.h" + "/usr/local/include/boost/predef/platform/windows_store.h" + "/usr/local/include/boost/predef/version_number.h" + "/usr/local/include/boost/scoped_ptr.hpp" + "/usr/local/include/boost/shared_ptr.hpp" + "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" + "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" + "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" + "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp" + "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" + "/usr/local/include/boost/throw_exception.hpp" + "/usr/local/include/gflags/gflags.h" + "/usr/local/include/gflags/gflags_declare.h" + "/usr/local/include/glog/log_severity.h" + "/usr/local/include/glog/logging.h" + "/usr/local/include/glog/vlog_is_on.h" +) + diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake new file mode 100644 index 00000000..d7dfae88 --- 
/dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. 
+ +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend new file mode 100644 index 00000000..f9de6105 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend @@ -0,0 +1,468 @@ +# Generated by: make2cmake.cmake +SET(CUDA_NVCC_DEPEND + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" + 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu" + "/opt/clBLAS-private-april8/include/clBLAS-complex.h" + "/opt/clBLAS-private-april8/include/clBLAS.h" + "/usr/include/H5ACpublic.h" + "/usr/include/H5Apublic.h" + "/usr/include/H5Cpublic.h" + "/usr/include/H5Dpublic.h" + "/usr/include/H5Epubgen.h" + "/usr/include/H5Epublic.h" + "/usr/include/H5FDcore.h" + "/usr/include/H5FDdirect.h" + "/usr/include/H5FDfamily.h" + "/usr/include/H5FDlog.h" + "/usr/include/H5FDmpi.h" + "/usr/include/H5FDmpio.h" + "/usr/include/H5FDmpiposix.h" + "/usr/include/H5FDmulti.h" + "/usr/include/H5FDpublic.h" + "/usr/include/H5FDsec2.h" + "/usr/include/H5FDstdio.h" + "/usr/include/H5Fpublic.h" + "/usr/include/H5Gpublic.h" + "/usr/include/H5Ipublic.h" + "/usr/include/H5Lpublic.h" + "/usr/include/H5MMpublic.h" + "/usr/include/H5Opublic.h" + "/usr/include/H5Ppublic.h" + "/usr/include/H5Rpublic.h" + "/usr/include/H5Spublic.h" + "/usr/include/H5Tpublic.h" + "/usr/include/H5Zpublic.h" + "/usr/include/H5api_adpt.h" + "/usr/include/H5pubconf.h" + "/usr/include/H5public.h" + "/usr/include/H5version.h" + "/usr/include/_G_config.h" + "/usr/include/alloca.h" + "/usr/include/asm-generic/errno-base.h" + "/usr/include/asm-generic/errno.h" + 
"/usr/include/assert.h" + "/usr/include/atlas/cblas.h" + "/usr/include/c++/4.8/algorithm" + "/usr/include/c++/4.8/backward/auto_ptr.h" + "/usr/include/c++/4.8/backward/binders.h" + "/usr/include/c++/4.8/bits/algorithmfwd.h" + "/usr/include/c++/4.8/bits/allocator.h" + "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" + "/usr/include/c++/4.8/bits/basic_ios.h" + "/usr/include/c++/4.8/bits/basic_ios.tcc" + "/usr/include/c++/4.8/bits/basic_string.h" + "/usr/include/c++/4.8/bits/basic_string.tcc" + "/usr/include/c++/4.8/bits/char_traits.h" + "/usr/include/c++/4.8/bits/codecvt.h" + "/usr/include/c++/4.8/bits/concept_check.h" + "/usr/include/c++/4.8/bits/cpp_type_traits.h" + "/usr/include/c++/4.8/bits/cxxabi_forced.h" + "/usr/include/c++/4.8/bits/exception_defines.h" + "/usr/include/c++/4.8/bits/fstream.tcc" + "/usr/include/c++/4.8/bits/functexcept.h" + "/usr/include/c++/4.8/bits/ios_base.h" + "/usr/include/c++/4.8/bits/istream.tcc" + "/usr/include/c++/4.8/bits/locale_classes.h" + "/usr/include/c++/4.8/bits/locale_classes.tcc" + "/usr/include/c++/4.8/bits/locale_facets.h" + "/usr/include/c++/4.8/bits/locale_facets.tcc" + "/usr/include/c++/4.8/bits/localefwd.h" + "/usr/include/c++/4.8/bits/memoryfwd.h" + "/usr/include/c++/4.8/bits/move.h" + "/usr/include/c++/4.8/bits/ostream.tcc" + "/usr/include/c++/4.8/bits/ostream_insert.h" + "/usr/include/c++/4.8/bits/postypes.h" + "/usr/include/c++/4.8/bits/range_access.h" + "/usr/include/c++/4.8/bits/sstream.tcc" + "/usr/include/c++/4.8/bits/stl_algo.h" + "/usr/include/c++/4.8/bits/stl_algobase.h" + "/usr/include/c++/4.8/bits/stl_bvector.h" + "/usr/include/c++/4.8/bits/stl_construct.h" + "/usr/include/c++/4.8/bits/stl_function.h" + "/usr/include/c++/4.8/bits/stl_heap.h" + "/usr/include/c++/4.8/bits/stl_iterator.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" + "/usr/include/c++/4.8/bits/stl_map.h" + "/usr/include/c++/4.8/bits/stl_multimap.h" + 
"/usr/include/c++/4.8/bits/stl_multiset.h" + "/usr/include/c++/4.8/bits/stl_pair.h" + "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" + "/usr/include/c++/4.8/bits/stl_relops.h" + "/usr/include/c++/4.8/bits/stl_set.h" + "/usr/include/c++/4.8/bits/stl_tempbuf.h" + "/usr/include/c++/4.8/bits/stl_tree.h" + "/usr/include/c++/4.8/bits/stl_uninitialized.h" + "/usr/include/c++/4.8/bits/stl_vector.h" + "/usr/include/c++/4.8/bits/stream_iterator.h" + "/usr/include/c++/4.8/bits/streambuf.tcc" + "/usr/include/c++/4.8/bits/streambuf_iterator.h" + "/usr/include/c++/4.8/bits/stringfwd.h" + "/usr/include/c++/4.8/bits/vector.tcc" + "/usr/include/c++/4.8/cctype" + "/usr/include/c++/4.8/climits" + "/usr/include/c++/4.8/clocale" + "/usr/include/c++/4.8/cmath" + "/usr/include/c++/4.8/cstddef" + "/usr/include/c++/4.8/cstdio" + "/usr/include/c++/4.8/cstdlib" + "/usr/include/c++/4.8/cwchar" + "/usr/include/c++/4.8/cwctype" + "/usr/include/c++/4.8/cxxabi.h" + "/usr/include/c++/4.8/debug/debug.h" + "/usr/include/c++/4.8/exception" + "/usr/include/c++/4.8/ext/alloc_traits.h" + "/usr/include/c++/4.8/ext/atomicity.h" + "/usr/include/c++/4.8/ext/new_allocator.h" + "/usr/include/c++/4.8/ext/numeric_traits.h" + "/usr/include/c++/4.8/ext/type_traits.h" + "/usr/include/c++/4.8/fstream" + "/usr/include/c++/4.8/functional" + "/usr/include/c++/4.8/ios" + "/usr/include/c++/4.8/iosfwd" + "/usr/include/c++/4.8/iostream" + "/usr/include/c++/4.8/istream" + "/usr/include/c++/4.8/iterator" + "/usr/include/c++/4.8/map" + "/usr/include/c++/4.8/memory" + "/usr/include/c++/4.8/new" + "/usr/include/c++/4.8/ostream" + "/usr/include/c++/4.8/set" + "/usr/include/c++/4.8/sstream" + "/usr/include/c++/4.8/streambuf" + "/usr/include/c++/4.8/string" + "/usr/include/c++/4.8/typeinfo" + "/usr/include/c++/4.8/utility" + "/usr/include/c++/4.8/vector" + "/usr/include/ctype.h" + "/usr/include/endian.h" + "/usr/include/errno.h" + "/usr/include/features.h" + "/usr/include/getopt.h" + 
"/usr/include/google/protobuf/descriptor.h" + "/usr/include/google/protobuf/extension_set.h" + "/usr/include/google/protobuf/generated_enum_reflection.h" + "/usr/include/google/protobuf/generated_message_util.h" + "/usr/include/google/protobuf/message.h" + "/usr/include/google/protobuf/message_lite.h" + "/usr/include/google/protobuf/repeated_field.h" + "/usr/include/google/protobuf/stubs/common.h" + "/usr/include/google/protobuf/stubs/template_util.h" + "/usr/include/google/protobuf/stubs/type_traits.h" + "/usr/include/google/protobuf/unknown_field_set.h" + "/usr/include/hdf5.h" + "/usr/include/inttypes.h" + "/usr/include/libio.h" + "/usr/include/limits.h" + "/usr/include/linux/errno.h" + "/usr/include/linux/limits.h" + "/usr/include/locale.h" + "/usr/include/math.h" + "/usr/include/pthread.h" + "/usr/include/sched.h" + "/usr/include/stdc-predef.h" + "/usr/include/stdint.h" + "/usr/include/stdio.h" + "/usr/include/stdlib.h" + "/usr/include/string.h" + "/usr/include/time.h" + "/usr/include/unistd.h" + "/usr/include/wchar.h" + "/usr/include/wctype.h" + "/usr/include/x86_64-linux-gnu/asm/errno.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap.h" + "/usr/include/x86_64-linux-gnu/bits/confname.h" + "/usr/include/x86_64-linux-gnu/bits/endian.h" + "/usr/include/x86_64-linux-gnu/bits/environments.h" + "/usr/include/x86_64-linux-gnu/bits/errno.h" + "/usr/include/x86_64-linux-gnu/bits/huge_val.h" + "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" + "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" + "/usr/include/x86_64-linux-gnu/bits/inf.h" + "/usr/include/x86_64-linux-gnu/bits/local_lim.h" + "/usr/include/x86_64-linux-gnu/bits/locale.h" + "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" + "/usr/include/x86_64-linux-gnu/bits/mathdef.h" + "/usr/include/x86_64-linux-gnu/bits/mathinline.h" + "/usr/include/x86_64-linux-gnu/bits/nan.h" + "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" + 
"/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" + "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" + "/usr/include/x86_64-linux-gnu/bits/sched.h" + "/usr/include/x86_64-linux-gnu/bits/select.h" + "/usr/include/x86_64-linux-gnu/bits/select2.h" + "/usr/include/x86_64-linux-gnu/bits/setjmp.h" + "/usr/include/x86_64-linux-gnu/bits/sigset.h" + "/usr/include/x86_64-linux-gnu/bits/stdio.h" + "/usr/include/x86_64-linux-gnu/bits/stdio2.h" + "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib.h" + "/usr/include/x86_64-linux-gnu/bits/string3.h" + "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" + "/usr/include/x86_64-linux-gnu/bits/time.h" + "/usr/include/x86_64-linux-gnu/bits/timex.h" + "/usr/include/x86_64-linux-gnu/bits/types.h" + "/usr/include/x86_64-linux-gnu/bits/typesizes.h" + "/usr/include/x86_64-linux-gnu/bits/unistd.h" + "/usr/include/x86_64-linux-gnu/bits/waitflags.h" + "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" + "/usr/include/x86_64-linux-gnu/bits/wchar.h" + "/usr/include/x86_64-linux-gnu/bits/wchar2.h" + "/usr/include/x86_64-linux-gnu/bits/wordsize.h" + "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" + 
"/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs.h" + "/usr/include/x86_64-linux-gnu/sys/cdefs.h" + "/usr/include/x86_64-linux-gnu/sys/select.h" + "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" + "/usr/include/x86_64-linux-gnu/sys/types.h" + "/usr/include/xlocale.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" + "/usr/local/cuda-6.5/include/CL/cl.h" + "/usr/local/cuda-6.5/include/CL/cl_ext.h" + "/usr/local/cuda-6.5/include/CL/cl_platform.h" + "/usr/local/cuda-6.5/include/builtin_types.h" + "/usr/local/cuda-6.5/include/channel_descriptor.h" + "/usr/local/cuda-6.5/include/common_functions.h" + "/usr/local/cuda-6.5/include/cuComplex.h" + "/usr/local/cuda-6.5/include/cublas_api.h" + "/usr/local/cuda-6.5/include/cublas_v2.h" + "/usr/local/cuda-6.5/include/cuda.h" + "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_runtime.h" + "/usr/local/cuda-6.5/include/cuda_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_surface_types.h" + "/usr/local/cuda-6.5/include/cuda_texture_types.h" + "/usr/local/cuda-6.5/include/curand.h" + "/usr/local/cuda-6.5/include/device_functions.h" + "/usr/local/cuda-6.5/include/device_launch_parameters.h" + "/usr/local/cuda-6.5/include/device_types.h" + "/usr/local/cuda-6.5/include/driver_functions.h" + 
"/usr/local/cuda-6.5/include/driver_types.h" + "/usr/local/cuda-6.5/include/host_config.h" + "/usr/local/cuda-6.5/include/host_defines.h" + "/usr/local/cuda-6.5/include/math_functions.h" + "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" + "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_13_double_functions.h" + "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" + "/usr/local/cuda-6.5/include/surface_functions.h" + "/usr/local/cuda-6.5/include/surface_indirect_functions.h" + "/usr/local/cuda-6.5/include/surface_types.h" + "/usr/local/cuda-6.5/include/texture_fetch_functions.h" + "/usr/local/cuda-6.5/include/texture_indirect_functions.h" + "/usr/local/cuda-6.5/include/texture_types.h" + "/usr/local/cuda-6.5/include/vector_functions.h" + "/usr/local/cuda-6.5/include/vector_types.h" + "/usr/local/include/boost/assert.hpp" + "/usr/local/include/boost/checked_delete.hpp" + "/usr/local/include/boost/config.hpp" + "/usr/local/include/boost/config/compiler/gcc.hpp" + "/usr/local/include/boost/config/compiler/nvcc.hpp" + "/usr/local/include/boost/config/no_tr1/memory.hpp" + "/usr/local/include/boost/config/no_tr1/utility.hpp" + "/usr/local/include/boost/config/platform/linux.hpp" + "/usr/local/include/boost/config/posix_features.hpp" + "/usr/local/include/boost/config/select_compiler_config.hpp" + "/usr/local/include/boost/config/select_platform_config.hpp" + "/usr/local/include/boost/config/select_stdlib_config.hpp" + "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" + "/usr/local/include/boost/config/suffix.hpp" + 
"/usr/local/include/boost/config/user.hpp" + "/usr/local/include/boost/core/checked_delete.hpp" + "/usr/local/include/boost/core/demangle.hpp" + "/usr/local/include/boost/core/typeinfo.hpp" + "/usr/local/include/boost/current_function.hpp" + "/usr/local/include/boost/detail/sp_typeinfo.hpp" + "/usr/local/include/boost/detail/workaround.hpp" + "/usr/local/include/boost/exception/exception.hpp" + "/usr/local/include/boost/predef.h" + "/usr/local/include/boost/predef/architecture.h" + "/usr/local/include/boost/predef/architecture/alpha.h" + "/usr/local/include/boost/predef/architecture/arm.h" + "/usr/local/include/boost/predef/architecture/blackfin.h" + "/usr/local/include/boost/predef/architecture/convex.h" + "/usr/local/include/boost/predef/architecture/ia64.h" + "/usr/local/include/boost/predef/architecture/m68k.h" + "/usr/local/include/boost/predef/architecture/mips.h" + "/usr/local/include/boost/predef/architecture/parisc.h" + "/usr/local/include/boost/predef/architecture/ppc.h" + "/usr/local/include/boost/predef/architecture/pyramid.h" + "/usr/local/include/boost/predef/architecture/rs6k.h" + "/usr/local/include/boost/predef/architecture/sparc.h" + "/usr/local/include/boost/predef/architecture/superh.h" + "/usr/local/include/boost/predef/architecture/sys370.h" + "/usr/local/include/boost/predef/architecture/sys390.h" + "/usr/local/include/boost/predef/architecture/x86.h" + "/usr/local/include/boost/predef/architecture/x86/32.h" + "/usr/local/include/boost/predef/architecture/x86/64.h" + "/usr/local/include/boost/predef/architecture/z.h" + "/usr/local/include/boost/predef/compiler.h" + "/usr/local/include/boost/predef/compiler/borland.h" + "/usr/local/include/boost/predef/compiler/clang.h" + "/usr/local/include/boost/predef/compiler/comeau.h" + "/usr/local/include/boost/predef/compiler/compaq.h" + "/usr/local/include/boost/predef/compiler/diab.h" + "/usr/local/include/boost/predef/compiler/digitalmars.h" + "/usr/local/include/boost/predef/compiler/dignus.h" + 
"/usr/local/include/boost/predef/compiler/edg.h" + "/usr/local/include/boost/predef/compiler/ekopath.h" + "/usr/local/include/boost/predef/compiler/gcc.h" + "/usr/local/include/boost/predef/compiler/gcc_xml.h" + "/usr/local/include/boost/predef/compiler/greenhills.h" + "/usr/local/include/boost/predef/compiler/hp_acc.h" + "/usr/local/include/boost/predef/compiler/iar.h" + "/usr/local/include/boost/predef/compiler/ibm.h" + "/usr/local/include/boost/predef/compiler/intel.h" + "/usr/local/include/boost/predef/compiler/kai.h" + "/usr/local/include/boost/predef/compiler/llvm.h" + "/usr/local/include/boost/predef/compiler/metaware.h" + "/usr/local/include/boost/predef/compiler/metrowerks.h" + "/usr/local/include/boost/predef/compiler/microtec.h" + "/usr/local/include/boost/predef/compiler/mpw.h" + "/usr/local/include/boost/predef/compiler/palm.h" + "/usr/local/include/boost/predef/compiler/pgi.h" + "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" + "/usr/local/include/boost/predef/compiler/sunpro.h" + "/usr/local/include/boost/predef/compiler/tendra.h" + "/usr/local/include/boost/predef/compiler/visualc.h" + "/usr/local/include/boost/predef/compiler/watcom.h" + "/usr/local/include/boost/predef/detail/_cassert.h" + "/usr/local/include/boost/predef/detail/_exception.h" + "/usr/local/include/boost/predef/detail/comp_detected.h" + "/usr/local/include/boost/predef/detail/os_detected.h" + "/usr/local/include/boost/predef/detail/test.h" + "/usr/local/include/boost/predef/language.h" + "/usr/local/include/boost/predef/language/objc.h" + "/usr/local/include/boost/predef/language/stdc.h" + "/usr/local/include/boost/predef/language/stdcpp.h" + "/usr/local/include/boost/predef/library.h" + "/usr/local/include/boost/predef/library/c.h" + "/usr/local/include/boost/predef/library/c/_prefix.h" + "/usr/local/include/boost/predef/library/c/gnu.h" + "/usr/local/include/boost/predef/library/c/uc.h" + "/usr/local/include/boost/predef/library/c/vms.h" + 
"/usr/local/include/boost/predef/library/c/zos.h" + "/usr/local/include/boost/predef/library/std.h" + "/usr/local/include/boost/predef/library/std/_prefix.h" + "/usr/local/include/boost/predef/library/std/cxx.h" + "/usr/local/include/boost/predef/library/std/dinkumware.h" + "/usr/local/include/boost/predef/library/std/libcomo.h" + "/usr/local/include/boost/predef/library/std/modena.h" + "/usr/local/include/boost/predef/library/std/msl.h" + "/usr/local/include/boost/predef/library/std/roguewave.h" + "/usr/local/include/boost/predef/library/std/sgi.h" + "/usr/local/include/boost/predef/library/std/stdcpp3.h" + "/usr/local/include/boost/predef/library/std/stlport.h" + "/usr/local/include/boost/predef/library/std/vacpp.h" + "/usr/local/include/boost/predef/make.h" + "/usr/local/include/boost/predef/os.h" + "/usr/local/include/boost/predef/os/aix.h" + "/usr/local/include/boost/predef/os/amigaos.h" + "/usr/local/include/boost/predef/os/android.h" + "/usr/local/include/boost/predef/os/beos.h" + "/usr/local/include/boost/predef/os/bsd.h" + "/usr/local/include/boost/predef/os/bsd/bsdi.h" + "/usr/local/include/boost/predef/os/bsd/dragonfly.h" + "/usr/local/include/boost/predef/os/bsd/free.h" + "/usr/local/include/boost/predef/os/bsd/net.h" + "/usr/local/include/boost/predef/os/bsd/open.h" + "/usr/local/include/boost/predef/os/cygwin.h" + "/usr/local/include/boost/predef/os/hpux.h" + "/usr/local/include/boost/predef/os/ios.h" + "/usr/local/include/boost/predef/os/irix.h" + "/usr/local/include/boost/predef/os/linux.h" + "/usr/local/include/boost/predef/os/macos.h" + "/usr/local/include/boost/predef/os/os400.h" + "/usr/local/include/boost/predef/os/qnxnto.h" + "/usr/local/include/boost/predef/os/solaris.h" + "/usr/local/include/boost/predef/os/unix.h" + "/usr/local/include/boost/predef/os/vms.h" + "/usr/local/include/boost/predef/os/windows.h" + "/usr/local/include/boost/predef/other.h" + "/usr/local/include/boost/predef/other/endian.h" + 
"/usr/local/include/boost/predef/platform.h" + "/usr/local/include/boost/predef/platform/mingw.h" + "/usr/local/include/boost/predef/platform/windows_desktop.h" + "/usr/local/include/boost/predef/platform/windows_phone.h" + "/usr/local/include/boost/predef/platform/windows_runtime.h" + "/usr/local/include/boost/predef/platform/windows_store.h" + "/usr/local/include/boost/predef/version_number.h" + "/usr/local/include/boost/scoped_ptr.hpp" + "/usr/local/include/boost/shared_ptr.hpp" + "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" + "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" + "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" + "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp" + "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" + "/usr/local/include/boost/throw_exception.hpp" + "/usr/local/include/gflags/gflags.h" + "/usr/local/include/gflags/gflags_declare.h" + "/usr/local/include/glog/log_severity.h" + "/usr/local/include/glog/logging.h" + "/usr/local/include/glog/vlog_is_on.h" +) + diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake new file mode 100644 index 00000000..dd2453ae --- /dev/null +++ 
b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. 
+ +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/silence_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake new file mode 100644 index 00000000..990e0622 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/slice_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake new file mode 100644 index 00000000..ebf29ea2 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake new file mode 100644 index 00000000..6260b6e0 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_loss_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake new file mode 100644 index 00000000..ad49afe7 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/split_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake new file mode 100644 index 00000000..71fc8fdb --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/tanh_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake new file mode 100644 index 00000000..4e18059a --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/threshold_layer.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake new file mode 100644 index 00000000..8de5e27c --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. 
+ +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. 
This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. + +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend new file mode 100644 index 00000000..36db02fe --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend @@ -0,0 +1,404 @@ +# Generated by: make2cmake.cmake +SET(CUDA_NVCC_DEPEND + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu" + "/opt/clBLAS-private-april8/include/clBLAS-complex.h" + "/opt/clBLAS-private-april8/include/clBLAS.h" + "/usr/include/_G_config.h" + "/usr/include/alloca.h" + "/usr/include/asm-generic/errno-base.h" + "/usr/include/asm-generic/errno.h" + "/usr/include/assert.h" + "/usr/include/c++/4.8/algorithm" + "/usr/include/c++/4.8/backward/auto_ptr.h" + "/usr/include/c++/4.8/backward/binders.h" + "/usr/include/c++/4.8/bits/algorithmfwd.h" + "/usr/include/c++/4.8/bits/allocator.h" + "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" + 
"/usr/include/c++/4.8/bits/basic_ios.h" + "/usr/include/c++/4.8/bits/basic_ios.tcc" + "/usr/include/c++/4.8/bits/basic_string.h" + "/usr/include/c++/4.8/bits/basic_string.tcc" + "/usr/include/c++/4.8/bits/char_traits.h" + "/usr/include/c++/4.8/bits/codecvt.h" + "/usr/include/c++/4.8/bits/concept_check.h" + "/usr/include/c++/4.8/bits/cpp_type_traits.h" + "/usr/include/c++/4.8/bits/cxxabi_forced.h" + "/usr/include/c++/4.8/bits/exception_defines.h" + "/usr/include/c++/4.8/bits/fstream.tcc" + "/usr/include/c++/4.8/bits/functexcept.h" + "/usr/include/c++/4.8/bits/ios_base.h" + "/usr/include/c++/4.8/bits/istream.tcc" + "/usr/include/c++/4.8/bits/locale_classes.h" + "/usr/include/c++/4.8/bits/locale_classes.tcc" + "/usr/include/c++/4.8/bits/locale_facets.h" + "/usr/include/c++/4.8/bits/locale_facets.tcc" + "/usr/include/c++/4.8/bits/localefwd.h" + "/usr/include/c++/4.8/bits/memoryfwd.h" + "/usr/include/c++/4.8/bits/move.h" + "/usr/include/c++/4.8/bits/ostream.tcc" + "/usr/include/c++/4.8/bits/ostream_insert.h" + "/usr/include/c++/4.8/bits/postypes.h" + "/usr/include/c++/4.8/bits/range_access.h" + "/usr/include/c++/4.8/bits/sstream.tcc" + "/usr/include/c++/4.8/bits/stl_algo.h" + "/usr/include/c++/4.8/bits/stl_algobase.h" + "/usr/include/c++/4.8/bits/stl_bvector.h" + "/usr/include/c++/4.8/bits/stl_construct.h" + "/usr/include/c++/4.8/bits/stl_function.h" + "/usr/include/c++/4.8/bits/stl_heap.h" + "/usr/include/c++/4.8/bits/stl_iterator.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" + "/usr/include/c++/4.8/bits/stl_map.h" + "/usr/include/c++/4.8/bits/stl_multimap.h" + "/usr/include/c++/4.8/bits/stl_multiset.h" + "/usr/include/c++/4.8/bits/stl_pair.h" + "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" + "/usr/include/c++/4.8/bits/stl_relops.h" + "/usr/include/c++/4.8/bits/stl_set.h" + "/usr/include/c++/4.8/bits/stl_tempbuf.h" + "/usr/include/c++/4.8/bits/stl_tree.h" + 
"/usr/include/c++/4.8/bits/stl_uninitialized.h" + "/usr/include/c++/4.8/bits/stl_vector.h" + "/usr/include/c++/4.8/bits/streambuf.tcc" + "/usr/include/c++/4.8/bits/streambuf_iterator.h" + "/usr/include/c++/4.8/bits/stringfwd.h" + "/usr/include/c++/4.8/bits/vector.tcc" + "/usr/include/c++/4.8/cctype" + "/usr/include/c++/4.8/climits" + "/usr/include/c++/4.8/clocale" + "/usr/include/c++/4.8/cmath" + "/usr/include/c++/4.8/cstddef" + "/usr/include/c++/4.8/cstdio" + "/usr/include/c++/4.8/cstdlib" + "/usr/include/c++/4.8/cstring" + "/usr/include/c++/4.8/cwchar" + "/usr/include/c++/4.8/cwctype" + "/usr/include/c++/4.8/cxxabi.h" + "/usr/include/c++/4.8/debug/debug.h" + "/usr/include/c++/4.8/exception" + "/usr/include/c++/4.8/ext/alloc_traits.h" + "/usr/include/c++/4.8/ext/atomicity.h" + "/usr/include/c++/4.8/ext/new_allocator.h" + "/usr/include/c++/4.8/ext/numeric_traits.h" + "/usr/include/c++/4.8/ext/type_traits.h" + "/usr/include/c++/4.8/fstream" + "/usr/include/c++/4.8/functional" + "/usr/include/c++/4.8/ios" + "/usr/include/c++/4.8/iosfwd" + "/usr/include/c++/4.8/iostream" + "/usr/include/c++/4.8/istream" + "/usr/include/c++/4.8/map" + "/usr/include/c++/4.8/memory" + "/usr/include/c++/4.8/new" + "/usr/include/c++/4.8/ostream" + "/usr/include/c++/4.8/set" + "/usr/include/c++/4.8/sstream" + "/usr/include/c++/4.8/streambuf" + "/usr/include/c++/4.8/string" + "/usr/include/c++/4.8/typeinfo" + "/usr/include/c++/4.8/utility" + "/usr/include/c++/4.8/vector" + "/usr/include/ctype.h" + "/usr/include/endian.h" + "/usr/include/errno.h" + "/usr/include/features.h" + "/usr/include/getopt.h" + "/usr/include/inttypes.h" + "/usr/include/libio.h" + "/usr/include/limits.h" + "/usr/include/linux/errno.h" + "/usr/include/linux/limits.h" + "/usr/include/locale.h" + "/usr/include/math.h" + "/usr/include/pthread.h" + "/usr/include/sched.h" + "/usr/include/stdc-predef.h" + "/usr/include/stdint.h" + "/usr/include/stdio.h" + "/usr/include/stdlib.h" + "/usr/include/string.h" + 
"/usr/include/time.h" + "/usr/include/unistd.h" + "/usr/include/wchar.h" + "/usr/include/wctype.h" + "/usr/include/x86_64-linux-gnu/asm/errno.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap.h" + "/usr/include/x86_64-linux-gnu/bits/confname.h" + "/usr/include/x86_64-linux-gnu/bits/endian.h" + "/usr/include/x86_64-linux-gnu/bits/environments.h" + "/usr/include/x86_64-linux-gnu/bits/errno.h" + "/usr/include/x86_64-linux-gnu/bits/huge_val.h" + "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" + "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" + "/usr/include/x86_64-linux-gnu/bits/inf.h" + "/usr/include/x86_64-linux-gnu/bits/local_lim.h" + "/usr/include/x86_64-linux-gnu/bits/locale.h" + "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" + "/usr/include/x86_64-linux-gnu/bits/mathdef.h" + "/usr/include/x86_64-linux-gnu/bits/mathinline.h" + "/usr/include/x86_64-linux-gnu/bits/nan.h" + "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" + "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" + "/usr/include/x86_64-linux-gnu/bits/sched.h" + "/usr/include/x86_64-linux-gnu/bits/select.h" + "/usr/include/x86_64-linux-gnu/bits/select2.h" + "/usr/include/x86_64-linux-gnu/bits/setjmp.h" + "/usr/include/x86_64-linux-gnu/bits/sigset.h" + "/usr/include/x86_64-linux-gnu/bits/stdio.h" + "/usr/include/x86_64-linux-gnu/bits/stdio2.h" + "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib.h" + "/usr/include/x86_64-linux-gnu/bits/string3.h" + "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" + "/usr/include/x86_64-linux-gnu/bits/time.h" + "/usr/include/x86_64-linux-gnu/bits/timex.h" + "/usr/include/x86_64-linux-gnu/bits/types.h" + "/usr/include/x86_64-linux-gnu/bits/typesizes.h" + 
"/usr/include/x86_64-linux-gnu/bits/unistd.h" + "/usr/include/x86_64-linux-gnu/bits/waitflags.h" + "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" + "/usr/include/x86_64-linux-gnu/bits/wchar.h" + "/usr/include/x86_64-linux-gnu/bits/wchar2.h" + "/usr/include/x86_64-linux-gnu/bits/wordsize.h" + "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs.h" + "/usr/include/x86_64-linux-gnu/sys/cdefs.h" + "/usr/include/x86_64-linux-gnu/sys/select.h" + "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" + "/usr/include/x86_64-linux-gnu/sys/types.h" + "/usr/include/xlocale.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" + "/usr/local/cuda-6.5/include/CL/cl.h" + 
"/usr/local/cuda-6.5/include/CL/cl_ext.h" + "/usr/local/cuda-6.5/include/CL/cl_platform.h" + "/usr/local/cuda-6.5/include/builtin_types.h" + "/usr/local/cuda-6.5/include/channel_descriptor.h" + "/usr/local/cuda-6.5/include/common_functions.h" + "/usr/local/cuda-6.5/include/cuComplex.h" + "/usr/local/cuda-6.5/include/cublas_api.h" + "/usr/local/cuda-6.5/include/cublas_v2.h" + "/usr/local/cuda-6.5/include/cuda.h" + "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_runtime.h" + "/usr/local/cuda-6.5/include/cuda_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_surface_types.h" + "/usr/local/cuda-6.5/include/cuda_texture_types.h" + "/usr/local/cuda-6.5/include/curand.h" + "/usr/local/cuda-6.5/include/device_functions.h" + "/usr/local/cuda-6.5/include/device_launch_parameters.h" + "/usr/local/cuda-6.5/include/device_types.h" + "/usr/local/cuda-6.5/include/driver_functions.h" + "/usr/local/cuda-6.5/include/driver_types.h" + "/usr/local/cuda-6.5/include/host_config.h" + "/usr/local/cuda-6.5/include/host_defines.h" + "/usr/local/cuda-6.5/include/math_functions.h" + "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" + "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_13_double_functions.h" + "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" + "/usr/local/cuda-6.5/include/surface_functions.h" + "/usr/local/cuda-6.5/include/surface_indirect_functions.h" + "/usr/local/cuda-6.5/include/surface_types.h" + "/usr/local/cuda-6.5/include/texture_fetch_functions.h" + "/usr/local/cuda-6.5/include/texture_indirect_functions.h" 
+ "/usr/local/cuda-6.5/include/texture_types.h" + "/usr/local/cuda-6.5/include/vector_functions.h" + "/usr/local/cuda-6.5/include/vector_types.h" + "/usr/local/include/boost/assert.hpp" + "/usr/local/include/boost/checked_delete.hpp" + "/usr/local/include/boost/config.hpp" + "/usr/local/include/boost/config/compiler/gcc.hpp" + "/usr/local/include/boost/config/compiler/nvcc.hpp" + "/usr/local/include/boost/config/no_tr1/memory.hpp" + "/usr/local/include/boost/config/no_tr1/utility.hpp" + "/usr/local/include/boost/config/platform/linux.hpp" + "/usr/local/include/boost/config/posix_features.hpp" + "/usr/local/include/boost/config/select_compiler_config.hpp" + "/usr/local/include/boost/config/select_platform_config.hpp" + "/usr/local/include/boost/config/select_stdlib_config.hpp" + "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" + "/usr/local/include/boost/config/suffix.hpp" + "/usr/local/include/boost/config/user.hpp" + "/usr/local/include/boost/core/checked_delete.hpp" + "/usr/local/include/boost/core/demangle.hpp" + "/usr/local/include/boost/core/typeinfo.hpp" + "/usr/local/include/boost/current_function.hpp" + "/usr/local/include/boost/detail/sp_typeinfo.hpp" + "/usr/local/include/boost/detail/workaround.hpp" + "/usr/local/include/boost/exception/exception.hpp" + "/usr/local/include/boost/predef.h" + "/usr/local/include/boost/predef/architecture.h" + "/usr/local/include/boost/predef/architecture/alpha.h" + "/usr/local/include/boost/predef/architecture/arm.h" + "/usr/local/include/boost/predef/architecture/blackfin.h" + "/usr/local/include/boost/predef/architecture/convex.h" + "/usr/local/include/boost/predef/architecture/ia64.h" + "/usr/local/include/boost/predef/architecture/m68k.h" + "/usr/local/include/boost/predef/architecture/mips.h" + "/usr/local/include/boost/predef/architecture/parisc.h" + "/usr/local/include/boost/predef/architecture/ppc.h" + "/usr/local/include/boost/predef/architecture/pyramid.h" + 
"/usr/local/include/boost/predef/architecture/rs6k.h" + "/usr/local/include/boost/predef/architecture/sparc.h" + "/usr/local/include/boost/predef/architecture/superh.h" + "/usr/local/include/boost/predef/architecture/sys370.h" + "/usr/local/include/boost/predef/architecture/sys390.h" + "/usr/local/include/boost/predef/architecture/x86.h" + "/usr/local/include/boost/predef/architecture/x86/32.h" + "/usr/local/include/boost/predef/architecture/x86/64.h" + "/usr/local/include/boost/predef/architecture/z.h" + "/usr/local/include/boost/predef/compiler.h" + "/usr/local/include/boost/predef/compiler/borland.h" + "/usr/local/include/boost/predef/compiler/clang.h" + "/usr/local/include/boost/predef/compiler/comeau.h" + "/usr/local/include/boost/predef/compiler/compaq.h" + "/usr/local/include/boost/predef/compiler/diab.h" + "/usr/local/include/boost/predef/compiler/digitalmars.h" + "/usr/local/include/boost/predef/compiler/dignus.h" + "/usr/local/include/boost/predef/compiler/edg.h" + "/usr/local/include/boost/predef/compiler/ekopath.h" + "/usr/local/include/boost/predef/compiler/gcc.h" + "/usr/local/include/boost/predef/compiler/gcc_xml.h" + "/usr/local/include/boost/predef/compiler/greenhills.h" + "/usr/local/include/boost/predef/compiler/hp_acc.h" + "/usr/local/include/boost/predef/compiler/iar.h" + "/usr/local/include/boost/predef/compiler/ibm.h" + "/usr/local/include/boost/predef/compiler/intel.h" + "/usr/local/include/boost/predef/compiler/kai.h" + "/usr/local/include/boost/predef/compiler/llvm.h" + "/usr/local/include/boost/predef/compiler/metaware.h" + "/usr/local/include/boost/predef/compiler/metrowerks.h" + "/usr/local/include/boost/predef/compiler/microtec.h" + "/usr/local/include/boost/predef/compiler/mpw.h" + "/usr/local/include/boost/predef/compiler/palm.h" + "/usr/local/include/boost/predef/compiler/pgi.h" + "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" + "/usr/local/include/boost/predef/compiler/sunpro.h" + 
"/usr/local/include/boost/predef/compiler/tendra.h" + "/usr/local/include/boost/predef/compiler/visualc.h" + "/usr/local/include/boost/predef/compiler/watcom.h" + "/usr/local/include/boost/predef/detail/_cassert.h" + "/usr/local/include/boost/predef/detail/_exception.h" + "/usr/local/include/boost/predef/detail/comp_detected.h" + "/usr/local/include/boost/predef/detail/os_detected.h" + "/usr/local/include/boost/predef/detail/test.h" + "/usr/local/include/boost/predef/language.h" + "/usr/local/include/boost/predef/language/objc.h" + "/usr/local/include/boost/predef/language/stdc.h" + "/usr/local/include/boost/predef/language/stdcpp.h" + "/usr/local/include/boost/predef/library.h" + "/usr/local/include/boost/predef/library/c.h" + "/usr/local/include/boost/predef/library/c/_prefix.h" + "/usr/local/include/boost/predef/library/c/gnu.h" + "/usr/local/include/boost/predef/library/c/uc.h" + "/usr/local/include/boost/predef/library/c/vms.h" + "/usr/local/include/boost/predef/library/c/zos.h" + "/usr/local/include/boost/predef/library/std.h" + "/usr/local/include/boost/predef/library/std/_prefix.h" + "/usr/local/include/boost/predef/library/std/cxx.h" + "/usr/local/include/boost/predef/library/std/dinkumware.h" + "/usr/local/include/boost/predef/library/std/libcomo.h" + "/usr/local/include/boost/predef/library/std/modena.h" + "/usr/local/include/boost/predef/library/std/msl.h" + "/usr/local/include/boost/predef/library/std/roguewave.h" + "/usr/local/include/boost/predef/library/std/sgi.h" + "/usr/local/include/boost/predef/library/std/stdcpp3.h" + "/usr/local/include/boost/predef/library/std/stlport.h" + "/usr/local/include/boost/predef/library/std/vacpp.h" + "/usr/local/include/boost/predef/make.h" + "/usr/local/include/boost/predef/os.h" + "/usr/local/include/boost/predef/os/aix.h" + "/usr/local/include/boost/predef/os/amigaos.h" + "/usr/local/include/boost/predef/os/android.h" + "/usr/local/include/boost/predef/os/beos.h" + "/usr/local/include/boost/predef/os/bsd.h" + 
"/usr/local/include/boost/predef/os/bsd/bsdi.h" + "/usr/local/include/boost/predef/os/bsd/dragonfly.h" + "/usr/local/include/boost/predef/os/bsd/free.h" + "/usr/local/include/boost/predef/os/bsd/net.h" + "/usr/local/include/boost/predef/os/bsd/open.h" + "/usr/local/include/boost/predef/os/cygwin.h" + "/usr/local/include/boost/predef/os/hpux.h" + "/usr/local/include/boost/predef/os/ios.h" + "/usr/local/include/boost/predef/os/irix.h" + "/usr/local/include/boost/predef/os/linux.h" + "/usr/local/include/boost/predef/os/macos.h" + "/usr/local/include/boost/predef/os/os400.h" + "/usr/local/include/boost/predef/os/qnxnto.h" + "/usr/local/include/boost/predef/os/solaris.h" + "/usr/local/include/boost/predef/os/unix.h" + "/usr/local/include/boost/predef/os/vms.h" + "/usr/local/include/boost/predef/os/windows.h" + "/usr/local/include/boost/predef/other.h" + "/usr/local/include/boost/predef/other/endian.h" + "/usr/local/include/boost/predef/platform.h" + "/usr/local/include/boost/predef/platform/mingw.h" + "/usr/local/include/boost/predef/platform/windows_desktop.h" + "/usr/local/include/boost/predef/platform/windows_phone.h" + "/usr/local/include/boost/predef/platform/windows_runtime.h" + "/usr/local/include/boost/predef/platform/windows_store.h" + "/usr/local/include/boost/predef/version_number.h" + "/usr/local/include/boost/shared_ptr.hpp" + "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" + "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" + "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" + 
"/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" + "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" + "/usr/local/include/boost/throw_exception.hpp" + "/usr/local/include/gflags/gflags.h" + "/usr/local/include/gflags/gflags_declare.h" + "/usr/local/include/glog/log_severity.h" + "/usr/local/include/glog/logging.h" + "/usr/local/include/glog/vlog_is_on.h" +) + diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake new file mode 100644 index 00000000..0bd0d4e9 --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. 
+ +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend new file mode 100644 index 00000000..2dfb589a --- /dev/null +++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend @@ -0,0 +1,744 @@ +# Generated by: make2cmake.cmake +SET(CUDA_NVCC_DEPEND + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu" + "/opt/clBLAS-private-april8/include/clBLAS-complex.h" + "/opt/clBLAS-private-april8/include/clBLAS.h" + "/usr/include/_G_config.h" + "/usr/include/alloca.h" + "/usr/include/asm-generic/errno-base.h" + "/usr/include/asm-generic/errno.h" + "/usr/include/assert.h" + "/usr/include/atlas/cblas.h" + "/usr/include/c++/4.8/algorithm" + 
"/usr/include/c++/4.8/backward/auto_ptr.h" + "/usr/include/c++/4.8/backward/binders.h" + "/usr/include/c++/4.8/bits/algorithmfwd.h" + "/usr/include/c++/4.8/bits/allocator.h" + "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" + "/usr/include/c++/4.8/bits/basic_ios.h" + "/usr/include/c++/4.8/bits/basic_ios.tcc" + "/usr/include/c++/4.8/bits/basic_string.h" + "/usr/include/c++/4.8/bits/basic_string.tcc" + "/usr/include/c++/4.8/bits/char_traits.h" + "/usr/include/c++/4.8/bits/codecvt.h" + "/usr/include/c++/4.8/bits/concept_check.h" + "/usr/include/c++/4.8/bits/cpp_type_traits.h" + "/usr/include/c++/4.8/bits/cxxabi_forced.h" + "/usr/include/c++/4.8/bits/exception_defines.h" + "/usr/include/c++/4.8/bits/fstream.tcc" + "/usr/include/c++/4.8/bits/functexcept.h" + "/usr/include/c++/4.8/bits/ios_base.h" + "/usr/include/c++/4.8/bits/istream.tcc" + "/usr/include/c++/4.8/bits/locale_classes.h" + "/usr/include/c++/4.8/bits/locale_classes.tcc" + "/usr/include/c++/4.8/bits/locale_facets.h" + "/usr/include/c++/4.8/bits/locale_facets.tcc" + "/usr/include/c++/4.8/bits/localefwd.h" + "/usr/include/c++/4.8/bits/memoryfwd.h" + "/usr/include/c++/4.8/bits/move.h" + "/usr/include/c++/4.8/bits/ostream.tcc" + "/usr/include/c++/4.8/bits/ostream_insert.h" + "/usr/include/c++/4.8/bits/postypes.h" + "/usr/include/c++/4.8/bits/range_access.h" + "/usr/include/c++/4.8/bits/sstream.tcc" + "/usr/include/c++/4.8/bits/stl_algo.h" + "/usr/include/c++/4.8/bits/stl_algobase.h" + "/usr/include/c++/4.8/bits/stl_bvector.h" + "/usr/include/c++/4.8/bits/stl_construct.h" + "/usr/include/c++/4.8/bits/stl_function.h" + "/usr/include/c++/4.8/bits/stl_heap.h" + "/usr/include/c++/4.8/bits/stl_iterator.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" + "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" + "/usr/include/c++/4.8/bits/stl_map.h" + "/usr/include/c++/4.8/bits/stl_multimap.h" + "/usr/include/c++/4.8/bits/stl_multiset.h" + "/usr/include/c++/4.8/bits/stl_pair.h" + 
"/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" + "/usr/include/c++/4.8/bits/stl_relops.h" + "/usr/include/c++/4.8/bits/stl_set.h" + "/usr/include/c++/4.8/bits/stl_tempbuf.h" + "/usr/include/c++/4.8/bits/stl_tree.h" + "/usr/include/c++/4.8/bits/stl_uninitialized.h" + "/usr/include/c++/4.8/bits/stl_vector.h" + "/usr/include/c++/4.8/bits/stream_iterator.h" + "/usr/include/c++/4.8/bits/streambuf.tcc" + "/usr/include/c++/4.8/bits/streambuf_iterator.h" + "/usr/include/c++/4.8/bits/stringfwd.h" + "/usr/include/c++/4.8/bits/vector.tcc" + "/usr/include/c++/4.8/cctype" + "/usr/include/c++/4.8/climits" + "/usr/include/c++/4.8/clocale" + "/usr/include/c++/4.8/cmath" + "/usr/include/c++/4.8/cstddef" + "/usr/include/c++/4.8/cstdio" + "/usr/include/c++/4.8/cstdlib" + "/usr/include/c++/4.8/cstring" + "/usr/include/c++/4.8/cwchar" + "/usr/include/c++/4.8/cwctype" + "/usr/include/c++/4.8/cxxabi.h" + "/usr/include/c++/4.8/debug/debug.h" + "/usr/include/c++/4.8/exception" + "/usr/include/c++/4.8/ext/alloc_traits.h" + "/usr/include/c++/4.8/ext/atomicity.h" + "/usr/include/c++/4.8/ext/new_allocator.h" + "/usr/include/c++/4.8/ext/numeric_traits.h" + "/usr/include/c++/4.8/ext/type_traits.h" + "/usr/include/c++/4.8/fstream" + "/usr/include/c++/4.8/functional" + "/usr/include/c++/4.8/ios" + "/usr/include/c++/4.8/iosfwd" + "/usr/include/c++/4.8/iostream" + "/usr/include/c++/4.8/istream" + "/usr/include/c++/4.8/iterator" + "/usr/include/c++/4.8/limits" + "/usr/include/c++/4.8/map" + "/usr/include/c++/4.8/memory" + "/usr/include/c++/4.8/new" + "/usr/include/c++/4.8/ostream" + "/usr/include/c++/4.8/set" + "/usr/include/c++/4.8/sstream" + "/usr/include/c++/4.8/stdexcept" + "/usr/include/c++/4.8/streambuf" + "/usr/include/c++/4.8/string" + "/usr/include/c++/4.8/typeinfo" + "/usr/include/c++/4.8/utility" + "/usr/include/c++/4.8/vector" + "/usr/include/ctype.h" + "/usr/include/endian.h" + "/usr/include/errno.h" + "/usr/include/features.h" + "/usr/include/getopt.h" + "/usr/include/inttypes.h" + 
"/usr/include/libio.h" + "/usr/include/limits.h" + "/usr/include/linux/errno.h" + "/usr/include/linux/limits.h" + "/usr/include/locale.h" + "/usr/include/math.h" + "/usr/include/pthread.h" + "/usr/include/sched.h" + "/usr/include/stdc-predef.h" + "/usr/include/stdint.h" + "/usr/include/stdio.h" + "/usr/include/stdlib.h" + "/usr/include/string.h" + "/usr/include/time.h" + "/usr/include/unistd.h" + "/usr/include/wchar.h" + "/usr/include/wctype.h" + "/usr/include/x86_64-linux-gnu/asm/errno.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" + "/usr/include/x86_64-linux-gnu/bits/byteswap.h" + "/usr/include/x86_64-linux-gnu/bits/confname.h" + "/usr/include/x86_64-linux-gnu/bits/endian.h" + "/usr/include/x86_64-linux-gnu/bits/environments.h" + "/usr/include/x86_64-linux-gnu/bits/errno.h" + "/usr/include/x86_64-linux-gnu/bits/huge_val.h" + "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" + "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" + "/usr/include/x86_64-linux-gnu/bits/inf.h" + "/usr/include/x86_64-linux-gnu/bits/local_lim.h" + "/usr/include/x86_64-linux-gnu/bits/locale.h" + "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" + "/usr/include/x86_64-linux-gnu/bits/mathdef.h" + "/usr/include/x86_64-linux-gnu/bits/mathinline.h" + "/usr/include/x86_64-linux-gnu/bits/nan.h" + "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" + "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" + "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" + "/usr/include/x86_64-linux-gnu/bits/sched.h" + "/usr/include/x86_64-linux-gnu/bits/select.h" + "/usr/include/x86_64-linux-gnu/bits/select2.h" + "/usr/include/x86_64-linux-gnu/bits/setjmp.h" + "/usr/include/x86_64-linux-gnu/bits/sigset.h" + "/usr/include/x86_64-linux-gnu/bits/stdio.h" + "/usr/include/x86_64-linux-gnu/bits/stdio2.h" + "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" + "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" + 
"/usr/include/x86_64-linux-gnu/bits/stdlib.h" + "/usr/include/x86_64-linux-gnu/bits/string3.h" + "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" + "/usr/include/x86_64-linux-gnu/bits/time.h" + "/usr/include/x86_64-linux-gnu/bits/timex.h" + "/usr/include/x86_64-linux-gnu/bits/types.h" + "/usr/include/x86_64-linux-gnu/bits/typesizes.h" + "/usr/include/x86_64-linux-gnu/bits/unistd.h" + "/usr/include/x86_64-linux-gnu/bits/waitflags.h" + "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" + "/usr/include/x86_64-linux-gnu/bits/wchar.h" + "/usr/include/x86_64-linux-gnu/bits/wchar2.h" + "/usr/include/x86_64-linux-gnu/bits/wordsize.h" + "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" + "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" + "/usr/include/x86_64-linux-gnu/gnu/stubs.h" + "/usr/include/x86_64-linux-gnu/sys/cdefs.h" + "/usr/include/x86_64-linux-gnu/sys/select.h" + "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" + "/usr/include/x86_64-linux-gnu/sys/types.h" + "/usr/include/xlocale.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" + 
"/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" + "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" + "/usr/local/cuda-6.5/include/CL/cl.h" + "/usr/local/cuda-6.5/include/CL/cl_ext.h" + "/usr/local/cuda-6.5/include/CL/cl_platform.h" + "/usr/local/cuda-6.5/include/builtin_types.h" + "/usr/local/cuda-6.5/include/channel_descriptor.h" + "/usr/local/cuda-6.5/include/common_functions.h" + "/usr/local/cuda-6.5/include/cuComplex.h" + "/usr/local/cuda-6.5/include/cublas_api.h" + "/usr/local/cuda-6.5/include/cublas_v2.h" + "/usr/local/cuda-6.5/include/cuda.h" + "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_runtime.h" + "/usr/local/cuda-6.5/include/cuda_runtime_api.h" + "/usr/local/cuda-6.5/include/cuda_surface_types.h" + "/usr/local/cuda-6.5/include/cuda_texture_types.h" + "/usr/local/cuda-6.5/include/curand.h" + "/usr/local/cuda-6.5/include/device_functions.h" + "/usr/local/cuda-6.5/include/device_launch_parameters.h" + "/usr/local/cuda-6.5/include/device_types.h" + "/usr/local/cuda-6.5/include/driver_functions.h" + "/usr/local/cuda-6.5/include/driver_types.h" + "/usr/local/cuda-6.5/include/host_config.h" + "/usr/local/cuda-6.5/include/host_defines.h" + "/usr/local/cuda-6.5/include/math_functions.h" + "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" + "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_13_double_functions.h" + "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" + "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" + 
"/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" + "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" + "/usr/local/cuda-6.5/include/surface_functions.h" + "/usr/local/cuda-6.5/include/surface_indirect_functions.h" + "/usr/local/cuda-6.5/include/surface_types.h" + "/usr/local/cuda-6.5/include/texture_fetch_functions.h" + "/usr/local/cuda-6.5/include/texture_indirect_functions.h" + "/usr/local/cuda-6.5/include/texture_types.h" + "/usr/local/cuda-6.5/include/thrust/advance.h" + "/usr/local/cuda-6.5/include/thrust/detail/advance.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/no_throw_allocator.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.inl" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.h" + "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.inl" + "/usr/local/cuda-6.5/include/thrust/detail/config.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/compiler.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/compiler_fence.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/config.h" + 
"/usr/local/cuda-6.5/include/thrust/detail/config/debug.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/device_system.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/forceinline.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/global_workarounds.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/hd_warning_disable.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/host_device.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/host_system.h" + "/usr/local/cuda-6.5/include/thrust/detail/config/simple_defines.h" + "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.h" + "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.inl" + "/usr/local/cuda-6.5/include/thrust/detail/copy.h" + "/usr/local/cuda-6.5/include/thrust/detail/copy.inl" + "/usr/local/cuda-6.5/include/thrust/detail/cstdint.h" + "/usr/local/cuda-6.5/include/thrust/detail/device_free.inl" + "/usr/local/cuda-6.5/include/thrust/detail/device_malloc.inl" + "/usr/local/cuda-6.5/include/thrust/detail/device_ptr.inl" + "/usr/local/cuda-6.5/include/thrust/detail/device_reference.inl" + "/usr/local/cuda-6.5/include/thrust/detail/device_vector.inl" + "/usr/local/cuda-6.5/include/thrust/detail/dispatch/is_trivial_copy.h" + "/usr/local/cuda-6.5/include/thrust/detail/distance.inl" + "/usr/local/cuda-6.5/include/thrust/detail/equal.inl" + "/usr/local/cuda-6.5/include/thrust/detail/execution_policy.h" + "/usr/local/cuda-6.5/include/thrust/detail/extrema.inl" + "/usr/local/cuda-6.5/include/thrust/detail/fill.inl" + "/usr/local/cuda-6.5/include/thrust/detail/find.inl" + "/usr/local/cuda-6.5/include/thrust/detail/for_each.inl" + "/usr/local/cuda-6.5/include/thrust/detail/function.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional.inl" + "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.inl" + "/usr/local/cuda-6.5/include/thrust/detail/functional/argument.h" + 
"/usr/local/cuda-6.5/include/thrust/detail/functional/composite.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/arithmetic_operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/assignment_operator.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/bitwise_operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/compound_assignment_operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/logical_operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/operator_adaptors.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/relational_operators.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/placeholder.h" + "/usr/local/cuda-6.5/include/thrust/detail/functional/value.h" + "/usr/local/cuda-6.5/include/thrust/detail/generate.inl" + "/usr/local/cuda-6.5/include/thrust/detail/host_vector.inl" + "/usr/local/cuda-6.5/include/thrust/detail/internal_functional.h" + "/usr/local/cuda-6.5/include/thrust/detail/malloc_and_free.h" + "/usr/local/cuda-6.5/include/thrust/detail/minmax.h" + "/usr/local/cuda-6.5/include/thrust/detail/mismatch.inl" + "/usr/local/cuda-6.5/include/thrust/detail/numeric_traits.h" + "/usr/local/cuda-6.5/include/thrust/detail/overlapped_copy.h" + "/usr/local/cuda-6.5/include/thrust/detail/pair.inl" + "/usr/local/cuda-6.5/include/thrust/detail/pointer.h" + "/usr/local/cuda-6.5/include/thrust/detail/pointer.inl" + "/usr/local/cuda-6.5/include/thrust/detail/raw_pointer_cast.h" + "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.h" + "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.inl" + "/usr/local/cuda-6.5/include/thrust/detail/reduce.inl" + "/usr/local/cuda-6.5/include/thrust/detail/reference.h" + "/usr/local/cuda-6.5/include/thrust/detail/reference.inl" + 
"/usr/local/cuda-6.5/include/thrust/detail/reference_forward_declaration.h" + "/usr/local/cuda-6.5/include/thrust/detail/replace.inl" + "/usr/local/cuda-6.5/include/thrust/detail/scan.inl" + "/usr/local/cuda-6.5/include/thrust/detail/scatter.inl" + "/usr/local/cuda-6.5/include/thrust/detail/static_assert.h" + "/usr/local/cuda-6.5/include/thrust/detail/swap.h" + "/usr/local/cuda-6.5/include/thrust/detail/swap.inl" + "/usr/local/cuda-6.5/include/thrust/detail/swap_ranges.inl" + "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.h" + "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.inl" + "/usr/local/cuda-6.5/include/thrust/detail/temporary_buffer.h" + "/usr/local/cuda-6.5/include/thrust/detail/transform.inl" + "/usr/local/cuda-6.5/include/thrust/detail/transform_reduce.inl" + "/usr/local/cuda-6.5/include/thrust/detail/tuple.inl" + "/usr/local/cuda-6.5/include/thrust/detail/tuple_meta_transform.h" + "/usr/local/cuda-6.5/include/thrust/detail/tuple_transform.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/function_traits.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_member_function.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_nested_type.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_trivial_assign.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_call_possible.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_metafunction_defined.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_output_iterator.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/minimum_type.h" + "/usr/local/cuda-6.5/include/thrust/detail/type_traits/pointer_traits.h" + 
"/usr/local/cuda-6.5/include/thrust/detail/type_traits/result_of.h" + "/usr/local/cuda-6.5/include/thrust/detail/uninitialized_fill.inl" + "/usr/local/cuda-6.5/include/thrust/detail/use_default.h" + "/usr/local/cuda-6.5/include/thrust/detail/util/align.h" + "/usr/local/cuda-6.5/include/thrust/detail/util/blocking.h" + "/usr/local/cuda-6.5/include/thrust/detail/vector_base.h" + "/usr/local/cuda-6.5/include/thrust/detail/vector_base.inl" + "/usr/local/cuda-6.5/include/thrust/device_free.h" + "/usr/local/cuda-6.5/include/thrust/device_malloc.h" + "/usr/local/cuda-6.5/include/thrust/device_malloc_allocator.h" + "/usr/local/cuda-6.5/include/thrust/device_ptr.h" + "/usr/local/cuda-6.5/include/thrust/device_reference.h" + "/usr/local/cuda-6.5/include/thrust/device_vector.h" + "/usr/local/cuda-6.5/include/thrust/distance.h" + "/usr/local/cuda-6.5/include/thrust/equal.h" + "/usr/local/cuda-6.5/include/thrust/extrema.h" + "/usr/local/cuda-6.5/include/thrust/fill.h" + "/usr/local/cuda-6.5/include/thrust/find.h" + "/usr/local/cuda-6.5/include/thrust/for_each.h" + "/usr/local/cuda-6.5/include/thrust/functional.h" + "/usr/local/cuda-6.5/include/thrust/generate.h" + "/usr/local/cuda-6.5/include/thrust/host_vector.h" + "/usr/local/cuda-6.5/include/thrust/iterator/counting_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_assign.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_system_tag.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/counting_iterator.inl" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/device_system_tag.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/discard_iterator_base.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/distance_from_result.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/host_system_tag.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_iterator_category.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_trivial_iterator.h" + 
"/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_adaptor_base.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_system.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_traversal.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_facade_category.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traits.inl" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traversal_tags.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_category.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_system.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/normal_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/permutation_iterator_base.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator.inl" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator_base.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/tagged_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/transform_iterator.inl" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/tuple_of_iterator_references.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/universal_categories.h" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator.inl" + "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator_base.h" + "/usr/local/cuda-6.5/include/thrust/iterator/discard_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/iterator_adaptor.h" + "/usr/local/cuda-6.5/include/thrust/iterator/iterator_categories.h" + "/usr/local/cuda-6.5/include/thrust/iterator/iterator_facade.h" + "/usr/local/cuda-6.5/include/thrust/iterator/iterator_traits.h" + "/usr/local/cuda-6.5/include/thrust/iterator/permutation_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/reverse_iterator.h" + "/usr/local/cuda-6.5/include/thrust/iterator/transform_iterator.h" + 
"/usr/local/cuda-6.5/include/thrust/iterator/zip_iterator.h" + "/usr/local/cuda-6.5/include/thrust/memory.h" + "/usr/local/cuda-6.5/include/thrust/mismatch.h" + "/usr/local/cuda-6.5/include/thrust/pair.h" + "/usr/local/cuda-6.5/include/thrust/reduce.h" + "/usr/local/cuda-6.5/include/thrust/replace.h" + "/usr/local/cuda-6.5/include/thrust/scan.h" + "/usr/local/cuda-6.5/include/thrust/scatter.h" + "/usr/local/cuda-6.5/include/thrust/swap.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/assign_value.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/copy.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/execution_policy.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/extrema.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/find.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/for_each.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/generate.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/get_value.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/iter_swap.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/malloc_and_free.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/swap_ranges.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/temporary_buffer.h" + "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/transform.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/assign_value.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/inclusive_scan.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.inl" + 
"/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/cuda_launch_config.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/alignment.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/uninitialized.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/error.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/execution_policy.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/extern_shared_ptr.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/get_value.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/iter_swap.h" + 
"/usr/local/cuda-6.5/include/thrust/system/cuda/detail/malloc_and_free.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/swap_ranges.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/transform.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.h" + "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.inl" + "/usr/local/cuda-6.5/include/thrust/system/cuda/error.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/assign_value.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/copy.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/equal.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/extrema.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/fill.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/find.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/for_each.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/generate.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/get_value.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/iter_swap.h" + 
"/usr/local/cuda-6.5/include/thrust/system/detail/adl/malloc_and_free.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/mismatch.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/replace.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scatter.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/swap_ranges.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/temporary_buffer.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform_reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/adl/uninitialized_fill.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/bad_alloc.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/errno.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/error_category.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/error_code.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/error_condition.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.inl" + 
"/usr/local/cuda-6.5/include/thrust/system/detail/generic/fill.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/for_each.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/select_system.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/tag.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.h" + 
"/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/type_traits.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/decompose.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.inl" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/extrema.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/find.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/for_each.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/general_copy.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan_by_key.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/trivial_copy.h" + "/usr/local/cuda-6.5/include/thrust/system/detail/system_error.inl" + "/usr/local/cuda-6.5/include/thrust/system/error_code.h" + "/usr/local/cuda-6.5/include/thrust/system/system_error.h" + "/usr/local/cuda-6.5/include/thrust/system_error.h" + "/usr/local/cuda-6.5/include/thrust/transform.h" + "/usr/local/cuda-6.5/include/thrust/transform_reduce.h" + "/usr/local/cuda-6.5/include/thrust/tuple.h" + 
"/usr/local/cuda-6.5/include/thrust/uninitialized_fill.h" + "/usr/local/cuda-6.5/include/vector_functions.h" + "/usr/local/cuda-6.5/include/vector_types.h" + "/usr/local/include/boost/assert.hpp" + "/usr/local/include/boost/checked_delete.hpp" + "/usr/local/include/boost/config.hpp" + "/usr/local/include/boost/config/compiler/gcc.hpp" + "/usr/local/include/boost/config/compiler/nvcc.hpp" + "/usr/local/include/boost/config/no_tr1/memory.hpp" + "/usr/local/include/boost/config/no_tr1/utility.hpp" + "/usr/local/include/boost/config/platform/linux.hpp" + "/usr/local/include/boost/config/posix_features.hpp" + "/usr/local/include/boost/config/select_compiler_config.hpp" + "/usr/local/include/boost/config/select_platform_config.hpp" + "/usr/local/include/boost/config/select_stdlib_config.hpp" + "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" + "/usr/local/include/boost/config/suffix.hpp" + "/usr/local/include/boost/config/user.hpp" + "/usr/local/include/boost/core/checked_delete.hpp" + "/usr/local/include/boost/core/demangle.hpp" + "/usr/local/include/boost/core/typeinfo.hpp" + "/usr/local/include/boost/current_function.hpp" + "/usr/local/include/boost/detail/sp_typeinfo.hpp" + "/usr/local/include/boost/detail/workaround.hpp" + "/usr/local/include/boost/exception/exception.hpp" + "/usr/local/include/boost/predef.h" + "/usr/local/include/boost/predef/architecture.h" + "/usr/local/include/boost/predef/architecture/alpha.h" + "/usr/local/include/boost/predef/architecture/arm.h" + "/usr/local/include/boost/predef/architecture/blackfin.h" + "/usr/local/include/boost/predef/architecture/convex.h" + "/usr/local/include/boost/predef/architecture/ia64.h" + "/usr/local/include/boost/predef/architecture/m68k.h" + "/usr/local/include/boost/predef/architecture/mips.h" + "/usr/local/include/boost/predef/architecture/parisc.h" + "/usr/local/include/boost/predef/architecture/ppc.h" + "/usr/local/include/boost/predef/architecture/pyramid.h" + 
"/usr/local/include/boost/predef/architecture/rs6k.h" + "/usr/local/include/boost/predef/architecture/sparc.h" + "/usr/local/include/boost/predef/architecture/superh.h" + "/usr/local/include/boost/predef/architecture/sys370.h" + "/usr/local/include/boost/predef/architecture/sys390.h" + "/usr/local/include/boost/predef/architecture/x86.h" + "/usr/local/include/boost/predef/architecture/x86/32.h" + "/usr/local/include/boost/predef/architecture/x86/64.h" + "/usr/local/include/boost/predef/architecture/z.h" + "/usr/local/include/boost/predef/compiler.h" + "/usr/local/include/boost/predef/compiler/borland.h" + "/usr/local/include/boost/predef/compiler/clang.h" + "/usr/local/include/boost/predef/compiler/comeau.h" + "/usr/local/include/boost/predef/compiler/compaq.h" + "/usr/local/include/boost/predef/compiler/diab.h" + "/usr/local/include/boost/predef/compiler/digitalmars.h" + "/usr/local/include/boost/predef/compiler/dignus.h" + "/usr/local/include/boost/predef/compiler/edg.h" + "/usr/local/include/boost/predef/compiler/ekopath.h" + "/usr/local/include/boost/predef/compiler/gcc.h" + "/usr/local/include/boost/predef/compiler/gcc_xml.h" + "/usr/local/include/boost/predef/compiler/greenhills.h" + "/usr/local/include/boost/predef/compiler/hp_acc.h" + "/usr/local/include/boost/predef/compiler/iar.h" + "/usr/local/include/boost/predef/compiler/ibm.h" + "/usr/local/include/boost/predef/compiler/intel.h" + "/usr/local/include/boost/predef/compiler/kai.h" + "/usr/local/include/boost/predef/compiler/llvm.h" + "/usr/local/include/boost/predef/compiler/metaware.h" + "/usr/local/include/boost/predef/compiler/metrowerks.h" + "/usr/local/include/boost/predef/compiler/microtec.h" + "/usr/local/include/boost/predef/compiler/mpw.h" + "/usr/local/include/boost/predef/compiler/palm.h" + "/usr/local/include/boost/predef/compiler/pgi.h" + "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" + "/usr/local/include/boost/predef/compiler/sunpro.h" + 
"/usr/local/include/boost/predef/compiler/tendra.h" + "/usr/local/include/boost/predef/compiler/visualc.h" + "/usr/local/include/boost/predef/compiler/watcom.h" + "/usr/local/include/boost/predef/detail/_cassert.h" + "/usr/local/include/boost/predef/detail/_exception.h" + "/usr/local/include/boost/predef/detail/comp_detected.h" + "/usr/local/include/boost/predef/detail/os_detected.h" + "/usr/local/include/boost/predef/detail/test.h" + "/usr/local/include/boost/predef/language.h" + "/usr/local/include/boost/predef/language/objc.h" + "/usr/local/include/boost/predef/language/stdc.h" + "/usr/local/include/boost/predef/language/stdcpp.h" + "/usr/local/include/boost/predef/library.h" + "/usr/local/include/boost/predef/library/c.h" + "/usr/local/include/boost/predef/library/c/_prefix.h" + "/usr/local/include/boost/predef/library/c/gnu.h" + "/usr/local/include/boost/predef/library/c/uc.h" + "/usr/local/include/boost/predef/library/c/vms.h" + "/usr/local/include/boost/predef/library/c/zos.h" + "/usr/local/include/boost/predef/library/std.h" + "/usr/local/include/boost/predef/library/std/_prefix.h" + "/usr/local/include/boost/predef/library/std/cxx.h" + "/usr/local/include/boost/predef/library/std/dinkumware.h" + "/usr/local/include/boost/predef/library/std/libcomo.h" + "/usr/local/include/boost/predef/library/std/modena.h" + "/usr/local/include/boost/predef/library/std/msl.h" + "/usr/local/include/boost/predef/library/std/roguewave.h" + "/usr/local/include/boost/predef/library/std/sgi.h" + "/usr/local/include/boost/predef/library/std/stdcpp3.h" + "/usr/local/include/boost/predef/library/std/stlport.h" + "/usr/local/include/boost/predef/library/std/vacpp.h" + "/usr/local/include/boost/predef/make.h" + "/usr/local/include/boost/predef/os.h" + "/usr/local/include/boost/predef/os/aix.h" + "/usr/local/include/boost/predef/os/amigaos.h" + "/usr/local/include/boost/predef/os/android.h" + "/usr/local/include/boost/predef/os/beos.h" + "/usr/local/include/boost/predef/os/bsd.h" + 
"/usr/local/include/boost/predef/os/bsd/bsdi.h" + "/usr/local/include/boost/predef/os/bsd/dragonfly.h" + "/usr/local/include/boost/predef/os/bsd/free.h" + "/usr/local/include/boost/predef/os/bsd/net.h" + "/usr/local/include/boost/predef/os/bsd/open.h" + "/usr/local/include/boost/predef/os/cygwin.h" + "/usr/local/include/boost/predef/os/hpux.h" + "/usr/local/include/boost/predef/os/ios.h" + "/usr/local/include/boost/predef/os/irix.h" + "/usr/local/include/boost/predef/os/linux.h" + "/usr/local/include/boost/predef/os/macos.h" + "/usr/local/include/boost/predef/os/os400.h" + "/usr/local/include/boost/predef/os/qnxnto.h" + "/usr/local/include/boost/predef/os/solaris.h" + "/usr/local/include/boost/predef/os/unix.h" + "/usr/local/include/boost/predef/os/vms.h" + "/usr/local/include/boost/predef/os/windows.h" + "/usr/local/include/boost/predef/other.h" + "/usr/local/include/boost/predef/other/endian.h" + "/usr/local/include/boost/predef/platform.h" + "/usr/local/include/boost/predef/platform/mingw.h" + "/usr/local/include/boost/predef/platform/windows_desktop.h" + "/usr/local/include/boost/predef/platform/windows_phone.h" + "/usr/local/include/boost/predef/platform/windows_runtime.h" + "/usr/local/include/boost/predef/platform/windows_store.h" + "/usr/local/include/boost/predef/version_number.h" + "/usr/local/include/boost/shared_ptr.hpp" + "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" + "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" + "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" + 
"/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" + "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" + "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" + "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" + "/usr/local/include/boost/throw_exception.hpp" + "/usr/local/include/gflags/gflags.h" + "/usr/local/include/gflags/gflags_declare.h" + "/usr/local/include/glog/log_severity.h" + "/usr/local/include/glog/logging.h" + "/usr/local/include/glog/vlog_is_on.h" +) + diff --git a/src/caffe/CMakeFiles/progress.marks b/src/caffe/CMakeFiles/progress.marks new file mode 100644 index 00000000..abdfb053 --- /dev/null +++ b/src/caffe/CMakeFiles/progress.marks @@ -0,0 +1 @@ +60 diff --git a/src/caffe/CMakeFiles/proto.dir/CXX.includecache b/src/caffe/CMakeFiles/proto.dir/CXX.includecache new file mode 100644 index 00000000..df68b9a9 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/CXX.includecache @@ -0,0 +1,48 @@ +#IncludeRegexLine: ^[ ]*#[ ]*(include|import)[ ]*[<"]([^">]+)([">]) + +#IncludeRegexScan: ^.*$ + +#IncludeRegexComplain: ^$ + +#IncludeRegexTransform: + +/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc +caffe.pb.h +/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h +algorithm +- +google/protobuf/stubs/common.h +- +google/protobuf/stubs/once.h +- +google/protobuf/io/coded_stream.h +- +google/protobuf/wire_format_lite_inl.h +- +google/protobuf/descriptor.h +- +google/protobuf/generated_message_reflection.h +- +google/protobuf/reflection_ops.h +- +google/protobuf/wire_format.h +- + +/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h +string +- +google/protobuf/stubs/common.h +- +google/protobuf/generated_message_util.h +- +google/protobuf/message.h +- +google/protobuf/repeated_field.h +- +google/protobuf/extension_set.h +- +google/protobuf/generated_enum_reflection.h +- +google/protobuf/unknown_field_set.h +- + diff --git 
a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake new file mode 100644 index 00000000..44c81e52 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake @@ -0,0 +1,39 @@ +# The set of languages for which implicit dependencies are needed: +SET(CMAKE_DEPENDS_LANGUAGES + "CXX" + ) +# The set of files for implicit dependencies of each language: +SET(CMAKE_DEPENDS_CHECK_CXX + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" + ) +SET(CMAKE_CXX_COMPILER_ID "GNU") + +# Preprocessor definitions for this target. +SET(CMAKE_TARGET_DEFINITIONS + "GTEST_USE_OWN_TR1_TUPLE" + ) + +# Pairs of files generated by the same build rule. +SET(CMAKE_MULTIPLE_OUTPUT_PAIRS + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" + ) + + +# Targets to which this target links. +SET(CMAKE_TARGET_LINKED_INFO_FILES + ) + +# The include file search paths: +SET(CMAKE_C_TARGET_INCLUDE_PATH + "src" + "/usr/local/include" + "include" + "/usr/local/cuda/include" + "/usr/local/include/opencv" + "/usr/include/atlas" + "." + ) +SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/CMakeFiles/proto.dir/build.make b/src/caffe/CMakeFiles/proto.dir/build.make new file mode 100644 index 00000000..1467c124 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/build.make @@ -0,0 +1,119 @@ +# CMAKE generated file: DO NOT EDIT! 
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# Include any dependencies generated for this target. +include src/caffe/CMakeFiles/proto.dir/depend.make + +# Include the progress variables for this target. +include src/caffe/CMakeFiles/proto.dir/progress.make + +# Include the compile flags for this target's objects. 
+include src/caffe/CMakeFiles/proto.dir/flags.make + +include/caffe/proto/caffe.pb.cc: src/caffe/proto/caffe.proto + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Running C++/Python protocol buffer compiler on /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --cpp_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --python_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto + +include/caffe/proto/caffe.pb.h: include/caffe/proto/caffe.pb.cc + +include/caffe/proto/caffe_pb2.py: include/caffe/proto/caffe.pb.cc + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: src/caffe/CMakeFiles/proto.dir/flags.make +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc > CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires: +.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires + $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build +.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o + +# Object files for target proto +proto_OBJECTS = \ +"CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" + +# External object files for target proto +proto_EXTERNAL_OBJECTS = + 
+lib/libproto.a: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o +lib/libproto.a: src/caffe/CMakeFiles/proto.dir/build.make +lib/libproto.a: src/caffe/CMakeFiles/proto.dir/link.txt + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libproto.a" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean_target.cmake + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/proto.dir/link.txt --verbose=$(VERBOSE) + +# Rule to build all files generated by this target. +src/caffe/CMakeFiles/proto.dir/build: lib/libproto.a +.PHONY : src/caffe/CMakeFiles/proto.dir/build + +src/caffe/CMakeFiles/proto.dir/requires: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires +.PHONY : src/caffe/CMakeFiles/proto.dir/requires + +src/caffe/CMakeFiles/proto.dir/clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean.cmake +.PHONY : src/caffe/CMakeFiles/proto.dir/clean + +src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.cc +src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.h +src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe_pb2.py + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake --color=$(COLOR) +.PHONY : src/caffe/CMakeFiles/proto.dir/depend + diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake new file mode 100644 index 00000000..79cb425a --- 
/dev/null +++ b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake @@ -0,0 +1,13 @@ +FILE(REMOVE_RECURSE + "../../include/caffe/proto/caffe.pb.cc" + "../../include/caffe/proto/caffe.pb.h" + "../../include/caffe/proto/caffe_pb2.py" + "CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" + "../../lib/libproto.pdb" + "../../lib/libproto.a" +) + +# Per-language clean rules from dependency scanning. +FOREACH(lang CXX) + INCLUDE(CMakeFiles/proto.dir/cmake_clean_${lang}.cmake OPTIONAL) +ENDFOREACH(lang) diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake new file mode 100644 index 00000000..6172b692 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake @@ -0,0 +1,3 @@ +FILE(REMOVE_RECURSE + "../../lib/libproto.a" +) diff --git a/src/caffe/CMakeFiles/proto.dir/depend.internal b/src/caffe/CMakeFiles/proto.dir/depend.internal new file mode 100644 index 00000000..2f8ec677 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/depend.internal @@ -0,0 +1,6 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o + /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc + /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h diff --git a/src/caffe/CMakeFiles/proto.dir/depend.make b/src/caffe/CMakeFiles/proto.dir/depend.make new file mode 100644 index 00000000..239c4242 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/depend.make @@ -0,0 +1,6 @@ +# CMAKE generated file: DO NOT EDIT! 
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc +src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.h + diff --git a/src/caffe/CMakeFiles/proto.dir/flags.make b/src/caffe/CMakeFiles/proto.dir/flags.make new file mode 100644 index 00000000..8b4ef992 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/flags.make @@ -0,0 +1,8 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# compile CXX with /usr/bin/c++ +CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe + +CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE + diff --git a/src/caffe/CMakeFiles/proto.dir/link.txt b/src/caffe/CMakeFiles/proto.dir/link.txt new file mode 100644 index 00000000..42f85bda --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/link.txt @@ -0,0 +1,2 @@ +/usr/bin/ar cr ../../lib/libproto.a CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o +/usr/bin/ranlib ../../lib/libproto.a diff --git a/src/caffe/CMakeFiles/proto.dir/progress.make b/src/caffe/CMakeFiles/proto.dir/progress.make new file mode 100644 index 00000000..25d32761 --- /dev/null +++ b/src/caffe/CMakeFiles/proto.dir/progress.make @@ -0,0 +1,3 @@ +CMAKE_PROGRESS_1 = 67 +CMAKE_PROGRESS_2 = + diff --git a/src/caffe/Makefile b/src/caffe/Makefile new file mode 100644 index 00000000..fff490de --- /dev/null +++ b/src/caffe/Makefile @@ -0,0 +1,2279 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Default target executed when no arguments are given to make. 
+default_target: all +.PHONY : default_target + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." 
+ /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: install/local +.PHONY : install/local/fast + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: install/strip +.PHONY : install/strip/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/progress.marks + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/all + $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/caffe/CMakeFiles/caffe.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/caffe.dir/rule +.PHONY : src/caffe/CMakeFiles/caffe.dir/rule + +# Convenience name for target. +caffe: src/caffe/CMakeFiles/caffe.dir/rule +.PHONY : caffe + +# fast build rule for target. 
+caffe/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/build +.PHONY : caffe/fast + +# Convenience name for target. +src/caffe/CMakeFiles/proto.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/proto.dir/rule +.PHONY : src/caffe/CMakeFiles/proto.dir/rule + +# Convenience name for target. +proto: src/caffe/CMakeFiles/proto.dir/rule +.PHONY : proto + +# fast build rule for target. +proto/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/build +.PHONY : proto/fast + +__/__/include/caffe/proto/caffe.pb.o: __/__/include/caffe/proto/caffe.pb.cc.o +.PHONY : __/__/include/caffe/proto/caffe.pb.o + +# target to build an object file +__/__/include/caffe/proto/caffe.pb.cc.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o +.PHONY : __/__/include/caffe/proto/caffe.pb.cc.o + +__/__/include/caffe/proto/caffe.pb.i: __/__/include/caffe/proto/caffe.pb.cc.i +.PHONY : __/__/include/caffe/proto/caffe.pb.i + +# target to preprocess a source file +__/__/include/caffe/proto/caffe.pb.cc.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i +.PHONY : __/__/include/caffe/proto/caffe.pb.cc.i + +__/__/include/caffe/proto/caffe.pb.s: __/__/include/caffe/proto/caffe.pb.cc.s +.PHONY : __/__/include/caffe/proto/caffe.pb.s + +# target to generate assembly for a file +__/__/include/caffe/proto/caffe.pb.cc.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s +.PHONY : 
__/__/include/caffe/proto/caffe.pb.cc.s + +blob.o: blob.cpp.o +.PHONY : blob.o + +# target to build an object file +blob.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o +.PHONY : blob.cpp.o + +blob.i: blob.cpp.i +.PHONY : blob.i + +# target to preprocess a source file +blob.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.i +.PHONY : blob.cpp.i + +blob.s: blob.cpp.s +.PHONY : blob.s + +# target to generate assembly for a file +blob.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.s +.PHONY : blob.cpp.s + +common.o: common.cpp.o +.PHONY : common.o + +# target to build an object file +common.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o +.PHONY : common.cpp.o + +common.i: common.cpp.i +.PHONY : common.i + +# target to preprocess a source file +common.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.i +.PHONY : common.cpp.i + +common.s: common.cpp.s +.PHONY : common.s + +# target to generate assembly for a file +common.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.s +.PHONY : common.cpp.s + +data_transformer.o: data_transformer.cpp.o +.PHONY : data_transformer.o + +# target to build an object file +data_transformer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o +.PHONY : data_transformer.cpp.o + +data_transformer.i: 
data_transformer.cpp.i +.PHONY : data_transformer.i + +# target to preprocess a source file +data_transformer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i +.PHONY : data_transformer.cpp.i + +data_transformer.s: data_transformer.cpp.s +.PHONY : data_transformer.s + +# target to generate assembly for a file +data_transformer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s +.PHONY : data_transformer.cpp.s + +device.o: device.cpp.o +.PHONY : device.o + +# target to build an object file +device.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o +.PHONY : device.cpp.o + +device.i: device.cpp.i +.PHONY : device.i + +# target to preprocess a source file +device.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.i +.PHONY : device.cpp.i + +device.s: device.cpp.s +.PHONY : device.s + +# target to generate assembly for a file +device.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.s +.PHONY : device.cpp.s + +internal_thread.o: internal_thread.cpp.o +.PHONY : internal_thread.o + +# target to build an object file +internal_thread.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o +.PHONY : internal_thread.cpp.o + +internal_thread.i: internal_thread.cpp.i +.PHONY : internal_thread.i + +# target to preprocess a source file +internal_thread.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i +.PHONY : internal_thread.cpp.i + +internal_thread.s: internal_thread.cpp.s +.PHONY : internal_thread.s + +# target to generate assembly for a file +internal_thread.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s +.PHONY : internal_thread.cpp.s + +layer_factory.o: layer_factory.cpp.o +.PHONY : layer_factory.o + +# target to build an object file +layer_factory.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o +.PHONY : layer_factory.cpp.o + +layer_factory.i: layer_factory.cpp.i +.PHONY : layer_factory.i + +# target to preprocess a source file +layer_factory.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i +.PHONY : layer_factory.cpp.i + +layer_factory.s: layer_factory.cpp.s +.PHONY : layer_factory.s + +# target to generate assembly for a file +layer_factory.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s +.PHONY : layer_factory.cpp.s + +layers/absval_layer.o: layers/absval_layer.cpp.o +.PHONY : layers/absval_layer.o + +# target to build an object file +layers/absval_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o +.PHONY : layers/absval_layer.cpp.o + +layers/absval_layer.i: layers/absval_layer.cpp.i +.PHONY : layers/absval_layer.i + +# target to preprocess a source file +layers/absval_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i +.PHONY : layers/absval_layer.cpp.i + +layers/absval_layer.s: layers/absval_layer.cpp.s +.PHONY : layers/absval_layer.s + +# target to generate assembly for a file +layers/absval_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s +.PHONY : layers/absval_layer.cpp.s + +layers/accuracy_layer.o: layers/accuracy_layer.cpp.o +.PHONY : layers/accuracy_layer.o + +# target to build an object file +layers/accuracy_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o +.PHONY : layers/accuracy_layer.cpp.o + +layers/accuracy_layer.i: layers/accuracy_layer.cpp.i +.PHONY : layers/accuracy_layer.i + +# target to preprocess a source file +layers/accuracy_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i +.PHONY : layers/accuracy_layer.cpp.i + +layers/accuracy_layer.s: layers/accuracy_layer.cpp.s +.PHONY : layers/accuracy_layer.s + +# target to generate assembly for a file +layers/accuracy_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s +.PHONY : layers/accuracy_layer.cpp.s + +layers/argmax_layer.o: layers/argmax_layer.cpp.o +.PHONY : layers/argmax_layer.o + +# target to build an object file +layers/argmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o +.PHONY : layers/argmax_layer.cpp.o + +layers/argmax_layer.i: layers/argmax_layer.cpp.i +.PHONY : 
layers/argmax_layer.i + +# target to preprocess a source file +layers/argmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i +.PHONY : layers/argmax_layer.cpp.i + +layers/argmax_layer.s: layers/argmax_layer.cpp.s +.PHONY : layers/argmax_layer.s + +# target to generate assembly for a file +layers/argmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s +.PHONY : layers/argmax_layer.cpp.s + +layers/base_conv_layer.o: layers/base_conv_layer.cpp.o +.PHONY : layers/base_conv_layer.o + +# target to build an object file +layers/base_conv_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o +.PHONY : layers/base_conv_layer.cpp.o + +layers/base_conv_layer.i: layers/base_conv_layer.cpp.i +.PHONY : layers/base_conv_layer.i + +# target to preprocess a source file +layers/base_conv_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i +.PHONY : layers/base_conv_layer.cpp.i + +layers/base_conv_layer.s: layers/base_conv_layer.cpp.s +.PHONY : layers/base_conv_layer.s + +# target to generate assembly for a file +layers/base_conv_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s +.PHONY : layers/base_conv_layer.cpp.s + +layers/base_data_layer.o: layers/base_data_layer.cpp.o +.PHONY : layers/base_data_layer.o + +# target to build an object file +layers/base_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o +.PHONY : layers/base_data_layer.cpp.o + +layers/base_data_layer.i: layers/base_data_layer.cpp.i +.PHONY : layers/base_data_layer.i + +# target to preprocess a source file +layers/base_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i +.PHONY : layers/base_data_layer.cpp.i + +layers/base_data_layer.s: layers/base_data_layer.cpp.s +.PHONY : layers/base_data_layer.s + +# target to generate assembly for a file +layers/base_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s +.PHONY : layers/base_data_layer.cpp.s + +layers/bnll_layer.o: layers/bnll_layer.cpp.o +.PHONY : layers/bnll_layer.o + +# target to build an object file +layers/bnll_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o +.PHONY : layers/bnll_layer.cpp.o + +layers/bnll_layer.i: layers/bnll_layer.cpp.i +.PHONY : layers/bnll_layer.i + +# target to preprocess a source file +layers/bnll_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i +.PHONY : layers/bnll_layer.cpp.i + +layers/bnll_layer.s: layers/bnll_layer.cpp.s +.PHONY : layers/bnll_layer.s + +# target to generate assembly for a file +layers/bnll_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s +.PHONY : layers/bnll_layer.cpp.s + +layers/concat_layer.o: layers/concat_layer.cpp.o +.PHONY : layers/concat_layer.o + +# target to 
build an object file +layers/concat_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o +.PHONY : layers/concat_layer.cpp.o + +layers/concat_layer.i: layers/concat_layer.cpp.i +.PHONY : layers/concat_layer.i + +# target to preprocess a source file +layers/concat_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i +.PHONY : layers/concat_layer.cpp.i + +layers/concat_layer.s: layers/concat_layer.cpp.s +.PHONY : layers/concat_layer.s + +# target to generate assembly for a file +layers/concat_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s +.PHONY : layers/concat_layer.cpp.s + +layers/contrastive_loss_layer.o: layers/contrastive_loss_layer.cpp.o +.PHONY : layers/contrastive_loss_layer.o + +# target to build an object file +layers/contrastive_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o +.PHONY : layers/contrastive_loss_layer.cpp.o + +layers/contrastive_loss_layer.i: layers/contrastive_loss_layer.cpp.i +.PHONY : layers/contrastive_loss_layer.i + +# target to preprocess a source file +layers/contrastive_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i +.PHONY : layers/contrastive_loss_layer.cpp.i + +layers/contrastive_loss_layer.s: layers/contrastive_loss_layer.cpp.s +.PHONY : layers/contrastive_loss_layer.s + +# target to generate assembly for a file +layers/contrastive_loss_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s +.PHONY : layers/contrastive_loss_layer.cpp.s + +layers/conv_layer.o: layers/conv_layer.cpp.o +.PHONY : layers/conv_layer.o + +# target to build an object file +layers/conv_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o +.PHONY : layers/conv_layer.cpp.o + +layers/conv_layer.i: layers/conv_layer.cpp.i +.PHONY : layers/conv_layer.i + +# target to preprocess a source file +layers/conv_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i +.PHONY : layers/conv_layer.cpp.i + +layers/conv_layer.s: layers/conv_layer.cpp.s +.PHONY : layers/conv_layer.s + +# target to generate assembly for a file +layers/conv_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s +.PHONY : layers/conv_layer.cpp.s + +layers/cudnn_conv_layer.o: layers/cudnn_conv_layer.cpp.o +.PHONY : layers/cudnn_conv_layer.o + +# target to build an object file +layers/cudnn_conv_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o +.PHONY : layers/cudnn_conv_layer.cpp.o + +layers/cudnn_conv_layer.i: layers/cudnn_conv_layer.cpp.i +.PHONY : layers/cudnn_conv_layer.i + +# target to preprocess a source file +layers/cudnn_conv_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i +.PHONY : layers/cudnn_conv_layer.cpp.i + 
+layers/cudnn_conv_layer.s: layers/cudnn_conv_layer.cpp.s +.PHONY : layers/cudnn_conv_layer.s + +# target to generate assembly for a file +layers/cudnn_conv_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s +.PHONY : layers/cudnn_conv_layer.cpp.s + +layers/cudnn_pooling_layer.o: layers/cudnn_pooling_layer.cpp.o +.PHONY : layers/cudnn_pooling_layer.o + +# target to build an object file +layers/cudnn_pooling_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o +.PHONY : layers/cudnn_pooling_layer.cpp.o + +layers/cudnn_pooling_layer.i: layers/cudnn_pooling_layer.cpp.i +.PHONY : layers/cudnn_pooling_layer.i + +# target to preprocess a source file +layers/cudnn_pooling_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i +.PHONY : layers/cudnn_pooling_layer.cpp.i + +layers/cudnn_pooling_layer.s: layers/cudnn_pooling_layer.cpp.s +.PHONY : layers/cudnn_pooling_layer.s + +# target to generate assembly for a file +layers/cudnn_pooling_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s +.PHONY : layers/cudnn_pooling_layer.cpp.s + +layers/cudnn_relu_layer.o: layers/cudnn_relu_layer.cpp.o +.PHONY : layers/cudnn_relu_layer.o + +# target to build an object file +layers/cudnn_relu_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o +.PHONY : layers/cudnn_relu_layer.cpp.o + +layers/cudnn_relu_layer.i: layers/cudnn_relu_layer.cpp.i +.PHONY : 
layers/cudnn_relu_layer.i + +# target to preprocess a source file +layers/cudnn_relu_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i +.PHONY : layers/cudnn_relu_layer.cpp.i + +layers/cudnn_relu_layer.s: layers/cudnn_relu_layer.cpp.s +.PHONY : layers/cudnn_relu_layer.s + +# target to generate assembly for a file +layers/cudnn_relu_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s +.PHONY : layers/cudnn_relu_layer.cpp.s + +layers/cudnn_sigmoid_layer.o: layers/cudnn_sigmoid_layer.cpp.o +.PHONY : layers/cudnn_sigmoid_layer.o + +# target to build an object file +layers/cudnn_sigmoid_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o +.PHONY : layers/cudnn_sigmoid_layer.cpp.o + +layers/cudnn_sigmoid_layer.i: layers/cudnn_sigmoid_layer.cpp.i +.PHONY : layers/cudnn_sigmoid_layer.i + +# target to preprocess a source file +layers/cudnn_sigmoid_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i +.PHONY : layers/cudnn_sigmoid_layer.cpp.i + +layers/cudnn_sigmoid_layer.s: layers/cudnn_sigmoid_layer.cpp.s +.PHONY : layers/cudnn_sigmoid_layer.s + +# target to generate assembly for a file +layers/cudnn_sigmoid_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s +.PHONY : layers/cudnn_sigmoid_layer.cpp.s + +layers/cudnn_softmax_layer.o: layers/cudnn_softmax_layer.cpp.o +.PHONY : layers/cudnn_softmax_layer.o + +# target to build an object 
file +layers/cudnn_softmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o +.PHONY : layers/cudnn_softmax_layer.cpp.o + +layers/cudnn_softmax_layer.i: layers/cudnn_softmax_layer.cpp.i +.PHONY : layers/cudnn_softmax_layer.i + +# target to preprocess a source file +layers/cudnn_softmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i +.PHONY : layers/cudnn_softmax_layer.cpp.i + +layers/cudnn_softmax_layer.s: layers/cudnn_softmax_layer.cpp.s +.PHONY : layers/cudnn_softmax_layer.s + +# target to generate assembly for a file +layers/cudnn_softmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s +.PHONY : layers/cudnn_softmax_layer.cpp.s + +layers/cudnn_tanh_layer.o: layers/cudnn_tanh_layer.cpp.o +.PHONY : layers/cudnn_tanh_layer.o + +# target to build an object file +layers/cudnn_tanh_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o +.PHONY : layers/cudnn_tanh_layer.cpp.o + +layers/cudnn_tanh_layer.i: layers/cudnn_tanh_layer.cpp.i +.PHONY : layers/cudnn_tanh_layer.i + +# target to preprocess a source file +layers/cudnn_tanh_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i +.PHONY : layers/cudnn_tanh_layer.cpp.i + +layers/cudnn_tanh_layer.s: layers/cudnn_tanh_layer.cpp.s +.PHONY : layers/cudnn_tanh_layer.s + +# target to generate assembly for a file +layers/cudnn_tanh_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s +.PHONY : layers/cudnn_tanh_layer.cpp.s + +layers/data_layer.o: layers/data_layer.cpp.o +.PHONY : layers/data_layer.o + +# target to build an object file +layers/data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o +.PHONY : layers/data_layer.cpp.o + +layers/data_layer.i: layers/data_layer.cpp.i +.PHONY : layers/data_layer.i + +# target to preprocess a source file +layers/data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i +.PHONY : layers/data_layer.cpp.i + +layers/data_layer.s: layers/data_layer.cpp.s +.PHONY : layers/data_layer.s + +# target to generate assembly for a file +layers/data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s +.PHONY : layers/data_layer.cpp.s + +layers/deconv_layer.o: layers/deconv_layer.cpp.o +.PHONY : layers/deconv_layer.o + +# target to build an object file +layers/deconv_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o +.PHONY : layers/deconv_layer.cpp.o + +layers/deconv_layer.i: layers/deconv_layer.cpp.i +.PHONY : layers/deconv_layer.i + +# target to preprocess a source file +layers/deconv_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i +.PHONY : layers/deconv_layer.cpp.i + +layers/deconv_layer.s: layers/deconv_layer.cpp.s +.PHONY : 
layers/deconv_layer.s + +# target to generate assembly for a file +layers/deconv_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s +.PHONY : layers/deconv_layer.cpp.s + +layers/dropout_layer.o: layers/dropout_layer.cpp.o +.PHONY : layers/dropout_layer.o + +# target to build an object file +layers/dropout_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o +.PHONY : layers/dropout_layer.cpp.o + +layers/dropout_layer.i: layers/dropout_layer.cpp.i +.PHONY : layers/dropout_layer.i + +# target to preprocess a source file +layers/dropout_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i +.PHONY : layers/dropout_layer.cpp.i + +layers/dropout_layer.s: layers/dropout_layer.cpp.s +.PHONY : layers/dropout_layer.s + +# target to generate assembly for a file +layers/dropout_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s +.PHONY : layers/dropout_layer.cpp.s + +layers/dummy_data_layer.o: layers/dummy_data_layer.cpp.o +.PHONY : layers/dummy_data_layer.o + +# target to build an object file +layers/dummy_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o +.PHONY : layers/dummy_data_layer.cpp.o + +layers/dummy_data_layer.i: layers/dummy_data_layer.cpp.i +.PHONY : layers/dummy_data_layer.i + +# target to preprocess a source file +layers/dummy_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i +.PHONY : layers/dummy_data_layer.cpp.i + +layers/dummy_data_layer.s: layers/dummy_data_layer.cpp.s +.PHONY : layers/dummy_data_layer.s + +# target to generate assembly for a file +layers/dummy_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s +.PHONY : layers/dummy_data_layer.cpp.s + +layers/eltwise_layer.o: layers/eltwise_layer.cpp.o +.PHONY : layers/eltwise_layer.o + +# target to build an object file +layers/eltwise_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o +.PHONY : layers/eltwise_layer.cpp.o + +layers/eltwise_layer.i: layers/eltwise_layer.cpp.i +.PHONY : layers/eltwise_layer.i + +# target to preprocess a source file +layers/eltwise_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i +.PHONY : layers/eltwise_layer.cpp.i + +layers/eltwise_layer.s: layers/eltwise_layer.cpp.s +.PHONY : layers/eltwise_layer.s + +# target to generate assembly for a file +layers/eltwise_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s +.PHONY : layers/eltwise_layer.cpp.s + +layers/euclidean_loss_layer.o: layers/euclidean_loss_layer.cpp.o +.PHONY : layers/euclidean_loss_layer.o + +# target to build an object file +layers/euclidean_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o +.PHONY : layers/euclidean_loss_layer.cpp.o + 
+layers/euclidean_loss_layer.i: layers/euclidean_loss_layer.cpp.i +.PHONY : layers/euclidean_loss_layer.i + +# target to preprocess a source file +layers/euclidean_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i +.PHONY : layers/euclidean_loss_layer.cpp.i + +layers/euclidean_loss_layer.s: layers/euclidean_loss_layer.cpp.s +.PHONY : layers/euclidean_loss_layer.s + +# target to generate assembly for a file +layers/euclidean_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s +.PHONY : layers/euclidean_loss_layer.cpp.s + +layers/exp_layer.o: layers/exp_layer.cpp.o +.PHONY : layers/exp_layer.o + +# target to build an object file +layers/exp_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o +.PHONY : layers/exp_layer.cpp.o + +layers/exp_layer.i: layers/exp_layer.cpp.i +.PHONY : layers/exp_layer.i + +# target to preprocess a source file +layers/exp_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i +.PHONY : layers/exp_layer.cpp.i + +layers/exp_layer.s: layers/exp_layer.cpp.s +.PHONY : layers/exp_layer.s + +# target to generate assembly for a file +layers/exp_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s +.PHONY : layers/exp_layer.cpp.s + +layers/filter_layer.o: layers/filter_layer.cpp.o +.PHONY : layers/filter_layer.o + +# target to build an object file +layers/filter_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && 
$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o +.PHONY : layers/filter_layer.cpp.o + +layers/filter_layer.i: layers/filter_layer.cpp.i +.PHONY : layers/filter_layer.i + +# target to preprocess a source file +layers/filter_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i +.PHONY : layers/filter_layer.cpp.i + +layers/filter_layer.s: layers/filter_layer.cpp.s +.PHONY : layers/filter_layer.s + +# target to generate assembly for a file +layers/filter_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s +.PHONY : layers/filter_layer.cpp.s + +layers/flatten_layer.o: layers/flatten_layer.cpp.o +.PHONY : layers/flatten_layer.o + +# target to build an object file +layers/flatten_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o +.PHONY : layers/flatten_layer.cpp.o + +layers/flatten_layer.i: layers/flatten_layer.cpp.i +.PHONY : layers/flatten_layer.i + +# target to preprocess a source file +layers/flatten_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i +.PHONY : layers/flatten_layer.cpp.i + +layers/flatten_layer.s: layers/flatten_layer.cpp.s +.PHONY : layers/flatten_layer.s + +# target to generate assembly for a file +layers/flatten_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s +.PHONY : layers/flatten_layer.cpp.s + +layers/hdf5_data_layer.o: layers/hdf5_data_layer.cpp.o +.PHONY : 
layers/hdf5_data_layer.o + +# target to build an object file +layers/hdf5_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o +.PHONY : layers/hdf5_data_layer.cpp.o + +layers/hdf5_data_layer.i: layers/hdf5_data_layer.cpp.i +.PHONY : layers/hdf5_data_layer.i + +# target to preprocess a source file +layers/hdf5_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i +.PHONY : layers/hdf5_data_layer.cpp.i + +layers/hdf5_data_layer.s: layers/hdf5_data_layer.cpp.s +.PHONY : layers/hdf5_data_layer.s + +# target to generate assembly for a file +layers/hdf5_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s +.PHONY : layers/hdf5_data_layer.cpp.s + +layers/hdf5_output_layer.o: layers/hdf5_output_layer.cpp.o +.PHONY : layers/hdf5_output_layer.o + +# target to build an object file +layers/hdf5_output_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o +.PHONY : layers/hdf5_output_layer.cpp.o + +layers/hdf5_output_layer.i: layers/hdf5_output_layer.cpp.i +.PHONY : layers/hdf5_output_layer.i + +# target to preprocess a source file +layers/hdf5_output_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i +.PHONY : layers/hdf5_output_layer.cpp.i + +layers/hdf5_output_layer.s: layers/hdf5_output_layer.cpp.s +.PHONY : layers/hdf5_output_layer.s + +# target to generate assembly for a file +layers/hdf5_output_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s +.PHONY : layers/hdf5_output_layer.cpp.s + +layers/hinge_loss_layer.o: layers/hinge_loss_layer.cpp.o +.PHONY : layers/hinge_loss_layer.o + +# target to build an object file +layers/hinge_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o +.PHONY : layers/hinge_loss_layer.cpp.o + +layers/hinge_loss_layer.i: layers/hinge_loss_layer.cpp.i +.PHONY : layers/hinge_loss_layer.i + +# target to preprocess a source file +layers/hinge_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i +.PHONY : layers/hinge_loss_layer.cpp.i + +layers/hinge_loss_layer.s: layers/hinge_loss_layer.cpp.s +.PHONY : layers/hinge_loss_layer.s + +# target to generate assembly for a file +layers/hinge_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s +.PHONY : layers/hinge_loss_layer.cpp.s + +layers/im2col_layer.o: layers/im2col_layer.cpp.o +.PHONY : layers/im2col_layer.o + +# target to build an object file +layers/im2col_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o +.PHONY : layers/im2col_layer.cpp.o + +layers/im2col_layer.i: layers/im2col_layer.cpp.i +.PHONY : layers/im2col_layer.i + +# target to preprocess a source file +layers/im2col_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i +.PHONY 
: layers/im2col_layer.cpp.i + +layers/im2col_layer.s: layers/im2col_layer.cpp.s +.PHONY : layers/im2col_layer.s + +# target to generate assembly for a file +layers/im2col_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s +.PHONY : layers/im2col_layer.cpp.s + +layers/image_data_layer.o: layers/image_data_layer.cpp.o +.PHONY : layers/image_data_layer.o + +# target to build an object file +layers/image_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o +.PHONY : layers/image_data_layer.cpp.o + +layers/image_data_layer.i: layers/image_data_layer.cpp.i +.PHONY : layers/image_data_layer.i + +# target to preprocess a source file +layers/image_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i +.PHONY : layers/image_data_layer.cpp.i + +layers/image_data_layer.s: layers/image_data_layer.cpp.s +.PHONY : layers/image_data_layer.s + +# target to generate assembly for a file +layers/image_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s +.PHONY : layers/image_data_layer.cpp.s + +layers/infogain_loss_layer.o: layers/infogain_loss_layer.cpp.o +.PHONY : layers/infogain_loss_layer.o + +# target to build an object file +layers/infogain_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o +.PHONY : layers/infogain_loss_layer.cpp.o + +layers/infogain_loss_layer.i: layers/infogain_loss_layer.cpp.i +.PHONY : 
layers/infogain_loss_layer.i + +# target to preprocess a source file +layers/infogain_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i +.PHONY : layers/infogain_loss_layer.cpp.i + +layers/infogain_loss_layer.s: layers/infogain_loss_layer.cpp.s +.PHONY : layers/infogain_loss_layer.s + +# target to generate assembly for a file +layers/infogain_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s +.PHONY : layers/infogain_loss_layer.cpp.s + +layers/inner_product_layer.o: layers/inner_product_layer.cpp.o +.PHONY : layers/inner_product_layer.o + +# target to build an object file +layers/inner_product_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o +.PHONY : layers/inner_product_layer.cpp.o + +layers/inner_product_layer.i: layers/inner_product_layer.cpp.i +.PHONY : layers/inner_product_layer.i + +# target to preprocess a source file +layers/inner_product_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i +.PHONY : layers/inner_product_layer.cpp.i + +layers/inner_product_layer.s: layers/inner_product_layer.cpp.s +.PHONY : layers/inner_product_layer.s + +# target to generate assembly for a file +layers/inner_product_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s +.PHONY : layers/inner_product_layer.cpp.s + +layers/log_layer.o: layers/log_layer.cpp.o +.PHONY : layers/log_layer.o + +# target to build an object 
file +layers/log_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o +.PHONY : layers/log_layer.cpp.o + +layers/log_layer.i: layers/log_layer.cpp.i +.PHONY : layers/log_layer.i + +# target to preprocess a source file +layers/log_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i +.PHONY : layers/log_layer.cpp.i + +layers/log_layer.s: layers/log_layer.cpp.s +.PHONY : layers/log_layer.s + +# target to generate assembly for a file +layers/log_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s +.PHONY : layers/log_layer.cpp.s + +layers/loss_layer.o: layers/loss_layer.cpp.o +.PHONY : layers/loss_layer.o + +# target to build an object file +layers/loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o +.PHONY : layers/loss_layer.cpp.o + +layers/loss_layer.i: layers/loss_layer.cpp.i +.PHONY : layers/loss_layer.i + +# target to preprocess a source file +layers/loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i +.PHONY : layers/loss_layer.cpp.i + +layers/loss_layer.s: layers/loss_layer.cpp.s +.PHONY : layers/loss_layer.s + +# target to generate assembly for a file +layers/loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s +.PHONY : layers/loss_layer.cpp.s + +layers/lrn_layer.o: layers/lrn_layer.cpp.o +.PHONY : layers/lrn_layer.o + +# target to 
build an object file +layers/lrn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o +.PHONY : layers/lrn_layer.cpp.o + +layers/lrn_layer.i: layers/lrn_layer.cpp.i +.PHONY : layers/lrn_layer.i + +# target to preprocess a source file +layers/lrn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i +.PHONY : layers/lrn_layer.cpp.i + +layers/lrn_layer.s: layers/lrn_layer.cpp.s +.PHONY : layers/lrn_layer.s + +# target to generate assembly for a file +layers/lrn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s +.PHONY : layers/lrn_layer.cpp.s + +layers/memory_data_layer.o: layers/memory_data_layer.cpp.o +.PHONY : layers/memory_data_layer.o + +# target to build an object file +layers/memory_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o +.PHONY : layers/memory_data_layer.cpp.o + +layers/memory_data_layer.i: layers/memory_data_layer.cpp.i +.PHONY : layers/memory_data_layer.i + +# target to preprocess a source file +layers/memory_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i +.PHONY : layers/memory_data_layer.cpp.i + +layers/memory_data_layer.s: layers/memory_data_layer.cpp.s +.PHONY : layers/memory_data_layer.s + +# target to generate assembly for a file +layers/memory_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s +.PHONY : layers/memory_data_layer.cpp.s + +layers/multinomial_logistic_loss_layer.o: layers/multinomial_logistic_loss_layer.cpp.o +.PHONY : layers/multinomial_logistic_loss_layer.o + +# target to build an object file +layers/multinomial_logistic_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o +.PHONY : layers/multinomial_logistic_loss_layer.cpp.o + +layers/multinomial_logistic_loss_layer.i: layers/multinomial_logistic_loss_layer.cpp.i +.PHONY : layers/multinomial_logistic_loss_layer.i + +# target to preprocess a source file +layers/multinomial_logistic_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i +.PHONY : layers/multinomial_logistic_loss_layer.cpp.i + +layers/multinomial_logistic_loss_layer.s: layers/multinomial_logistic_loss_layer.cpp.s +.PHONY : layers/multinomial_logistic_loss_layer.s + +# target to generate assembly for a file +layers/multinomial_logistic_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s +.PHONY : layers/multinomial_logistic_loss_layer.cpp.s + +layers/mvn_layer.o: layers/mvn_layer.cpp.o +.PHONY : layers/mvn_layer.o + +# target to build an object file +layers/mvn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o +.PHONY : layers/mvn_layer.cpp.o + +layers/mvn_layer.i: layers/mvn_layer.cpp.i +.PHONY : layers/mvn_layer.i + +# target to preprocess a source file +layers/mvn_layer.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i +.PHONY : layers/mvn_layer.cpp.i + +layers/mvn_layer.s: layers/mvn_layer.cpp.s +.PHONY : layers/mvn_layer.s + +# target to generate assembly for a file +layers/mvn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s +.PHONY : layers/mvn_layer.cpp.s + +layers/neuron_layer.o: layers/neuron_layer.cpp.o +.PHONY : layers/neuron_layer.o + +# target to build an object file +layers/neuron_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o +.PHONY : layers/neuron_layer.cpp.o + +layers/neuron_layer.i: layers/neuron_layer.cpp.i +.PHONY : layers/neuron_layer.i + +# target to preprocess a source file +layers/neuron_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i +.PHONY : layers/neuron_layer.cpp.i + +layers/neuron_layer.s: layers/neuron_layer.cpp.s +.PHONY : layers/neuron_layer.s + +# target to generate assembly for a file +layers/neuron_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s +.PHONY : layers/neuron_layer.cpp.s + +layers/pooling_layer.o: layers/pooling_layer.cpp.o +.PHONY : layers/pooling_layer.o + +# target to build an object file +layers/pooling_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o +.PHONY : layers/pooling_layer.cpp.o + +layers/pooling_layer.i: layers/pooling_layer.cpp.i +.PHONY : 
layers/pooling_layer.i + +# target to preprocess a source file +layers/pooling_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i +.PHONY : layers/pooling_layer.cpp.i + +layers/pooling_layer.s: layers/pooling_layer.cpp.s +.PHONY : layers/pooling_layer.s + +# target to generate assembly for a file +layers/pooling_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s +.PHONY : layers/pooling_layer.cpp.s + +layers/power_layer.o: layers/power_layer.cpp.o +.PHONY : layers/power_layer.o + +# target to build an object file +layers/power_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o +.PHONY : layers/power_layer.cpp.o + +layers/power_layer.i: layers/power_layer.cpp.i +.PHONY : layers/power_layer.i + +# target to preprocess a source file +layers/power_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i +.PHONY : layers/power_layer.cpp.i + +layers/power_layer.s: layers/power_layer.cpp.s +.PHONY : layers/power_layer.s + +# target to generate assembly for a file +layers/power_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s +.PHONY : layers/power_layer.cpp.s + +layers/prelu_layer.o: layers/prelu_layer.cpp.o +.PHONY : layers/prelu_layer.o + +# target to build an object file +layers/prelu_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o 
+.PHONY : layers/prelu_layer.cpp.o + +layers/prelu_layer.i: layers/prelu_layer.cpp.i +.PHONY : layers/prelu_layer.i + +# target to preprocess a source file +layers/prelu_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i +.PHONY : layers/prelu_layer.cpp.i + +layers/prelu_layer.s: layers/prelu_layer.cpp.s +.PHONY : layers/prelu_layer.s + +# target to generate assembly for a file +layers/prelu_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s +.PHONY : layers/prelu_layer.cpp.s + +layers/reduction_layer.o: layers/reduction_layer.cpp.o +.PHONY : layers/reduction_layer.o + +# target to build an object file +layers/reduction_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o +.PHONY : layers/reduction_layer.cpp.o + +layers/reduction_layer.i: layers/reduction_layer.cpp.i +.PHONY : layers/reduction_layer.i + +# target to preprocess a source file +layers/reduction_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i +.PHONY : layers/reduction_layer.cpp.i + +layers/reduction_layer.s: layers/reduction_layer.cpp.s +.PHONY : layers/reduction_layer.s + +# target to generate assembly for a file +layers/reduction_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s +.PHONY : layers/reduction_layer.cpp.s + +layers/relu_layer.o: layers/relu_layer.cpp.o +.PHONY : layers/relu_layer.o + +# target to build an object file +layers/relu_layer.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o +.PHONY : layers/relu_layer.cpp.o + +layers/relu_layer.i: layers/relu_layer.cpp.i +.PHONY : layers/relu_layer.i + +# target to preprocess a source file +layers/relu_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i +.PHONY : layers/relu_layer.cpp.i + +layers/relu_layer.s: layers/relu_layer.cpp.s +.PHONY : layers/relu_layer.s + +# target to generate assembly for a file +layers/relu_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s +.PHONY : layers/relu_layer.cpp.s + +layers/reshape_layer.o: layers/reshape_layer.cpp.o +.PHONY : layers/reshape_layer.o + +# target to build an object file +layers/reshape_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o +.PHONY : layers/reshape_layer.cpp.o + +layers/reshape_layer.i: layers/reshape_layer.cpp.i +.PHONY : layers/reshape_layer.i + +# target to preprocess a source file +layers/reshape_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i +.PHONY : layers/reshape_layer.cpp.i + +layers/reshape_layer.s: layers/reshape_layer.cpp.s +.PHONY : layers/reshape_layer.s + +# target to generate assembly for a file +layers/reshape_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s +.PHONY : layers/reshape_layer.cpp.s + +layers/sigmoid_cross_entropy_loss_layer.o: 
layers/sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : layers/sigmoid_cross_entropy_loss_layer.o + +# target to build an object file +layers/sigmoid_cross_entropy_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.o + +layers/sigmoid_cross_entropy_loss_layer.i: layers/sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : layers/sigmoid_cross_entropy_loss_layer.i + +# target to preprocess a source file +layers/sigmoid_cross_entropy_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.i + +layers/sigmoid_cross_entropy_loss_layer.s: layers/sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : layers/sigmoid_cross_entropy_loss_layer.s + +# target to generate assembly for a file +layers/sigmoid_cross_entropy_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.s + +layers/sigmoid_layer.o: layers/sigmoid_layer.cpp.o +.PHONY : layers/sigmoid_layer.o + +# target to build an object file +layers/sigmoid_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o +.PHONY : layers/sigmoid_layer.cpp.o + +layers/sigmoid_layer.i: layers/sigmoid_layer.cpp.i +.PHONY : layers/sigmoid_layer.i + +# target to preprocess a source file +layers/sigmoid_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i +.PHONY : layers/sigmoid_layer.cpp.i + +layers/sigmoid_layer.s: layers/sigmoid_layer.cpp.s +.PHONY : layers/sigmoid_layer.s + +# target to generate assembly for a file +layers/sigmoid_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s +.PHONY : layers/sigmoid_layer.cpp.s + +layers/silence_layer.o: layers/silence_layer.cpp.o +.PHONY : layers/silence_layer.o + +# target to build an object file +layers/silence_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o +.PHONY : layers/silence_layer.cpp.o + +layers/silence_layer.i: layers/silence_layer.cpp.i +.PHONY : layers/silence_layer.i + +# target to preprocess a source file +layers/silence_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i +.PHONY : layers/silence_layer.cpp.i + +layers/silence_layer.s: layers/silence_layer.cpp.s +.PHONY : layers/silence_layer.s + +# target to generate assembly for a file +layers/silence_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s +.PHONY : layers/silence_layer.cpp.s + +layers/slice_layer.o: layers/slice_layer.cpp.o +.PHONY : layers/slice_layer.o + +# target to build an object file +layers/slice_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o +.PHONY : layers/slice_layer.cpp.o + +layers/slice_layer.i: layers/slice_layer.cpp.i +.PHONY : layers/slice_layer.i + +# target to preprocess a source file 
+layers/slice_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i +.PHONY : layers/slice_layer.cpp.i + +layers/slice_layer.s: layers/slice_layer.cpp.s +.PHONY : layers/slice_layer.s + +# target to generate assembly for a file +layers/slice_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s +.PHONY : layers/slice_layer.cpp.s + +layers/softmax_layer.o: layers/softmax_layer.cpp.o +.PHONY : layers/softmax_layer.o + +# target to build an object file +layers/softmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o +.PHONY : layers/softmax_layer.cpp.o + +layers/softmax_layer.i: layers/softmax_layer.cpp.i +.PHONY : layers/softmax_layer.i + +# target to preprocess a source file +layers/softmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i +.PHONY : layers/softmax_layer.cpp.i + +layers/softmax_layer.s: layers/softmax_layer.cpp.s +.PHONY : layers/softmax_layer.s + +# target to generate assembly for a file +layers/softmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s +.PHONY : layers/softmax_layer.cpp.s + +layers/softmax_loss_layer.o: layers/softmax_loss_layer.cpp.o +.PHONY : layers/softmax_loss_layer.o + +# target to build an object file +layers/softmax_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o +.PHONY : 
layers/softmax_loss_layer.cpp.o + +layers/softmax_loss_layer.i: layers/softmax_loss_layer.cpp.i +.PHONY : layers/softmax_loss_layer.i + +# target to preprocess a source file +layers/softmax_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i +.PHONY : layers/softmax_loss_layer.cpp.i + +layers/softmax_loss_layer.s: layers/softmax_loss_layer.cpp.s +.PHONY : layers/softmax_loss_layer.s + +# target to generate assembly for a file +layers/softmax_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s +.PHONY : layers/softmax_loss_layer.cpp.s + +layers/split_layer.o: layers/split_layer.cpp.o +.PHONY : layers/split_layer.o + +# target to build an object file +layers/split_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o +.PHONY : layers/split_layer.cpp.o + +layers/split_layer.i: layers/split_layer.cpp.i +.PHONY : layers/split_layer.i + +# target to preprocess a source file +layers/split_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i +.PHONY : layers/split_layer.cpp.i + +layers/split_layer.s: layers/split_layer.cpp.s +.PHONY : layers/split_layer.s + +# target to generate assembly for a file +layers/split_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s +.PHONY : layers/split_layer.cpp.s + +layers/spp_layer.o: layers/spp_layer.cpp.o +.PHONY : layers/spp_layer.o + +# target to build an object file +layers/spp_layer.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o +.PHONY : layers/spp_layer.cpp.o + +layers/spp_layer.i: layers/spp_layer.cpp.i +.PHONY : layers/spp_layer.i + +# target to preprocess a source file +layers/spp_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i +.PHONY : layers/spp_layer.cpp.i + +layers/spp_layer.s: layers/spp_layer.cpp.s +.PHONY : layers/spp_layer.s + +# target to generate assembly for a file +layers/spp_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s +.PHONY : layers/spp_layer.cpp.s + +layers/tanh_layer.o: layers/tanh_layer.cpp.o +.PHONY : layers/tanh_layer.o + +# target to build an object file +layers/tanh_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o +.PHONY : layers/tanh_layer.cpp.o + +layers/tanh_layer.i: layers/tanh_layer.cpp.i +.PHONY : layers/tanh_layer.i + +# target to preprocess a source file +layers/tanh_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i +.PHONY : layers/tanh_layer.cpp.i + +layers/tanh_layer.s: layers/tanh_layer.cpp.s +.PHONY : layers/tanh_layer.s + +# target to generate assembly for a file +layers/tanh_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s +.PHONY : layers/tanh_layer.cpp.s + +layers/threshold_layer.o: layers/threshold_layer.cpp.o +.PHONY : layers/threshold_layer.o + +# target to build an object 
file +layers/threshold_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o +.PHONY : layers/threshold_layer.cpp.o + +layers/threshold_layer.i: layers/threshold_layer.cpp.i +.PHONY : layers/threshold_layer.i + +# target to preprocess a source file +layers/threshold_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i +.PHONY : layers/threshold_layer.cpp.i + +layers/threshold_layer.s: layers/threshold_layer.cpp.s +.PHONY : layers/threshold_layer.s + +# target to generate assembly for a file +layers/threshold_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s +.PHONY : layers/threshold_layer.cpp.s + +layers/window_data_layer.o: layers/window_data_layer.cpp.o +.PHONY : layers/window_data_layer.o + +# target to build an object file +layers/window_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o +.PHONY : layers/window_data_layer.cpp.o + +layers/window_data_layer.i: layers/window_data_layer.cpp.i +.PHONY : layers/window_data_layer.i + +# target to preprocess a source file +layers/window_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i +.PHONY : layers/window_data_layer.cpp.i + +layers/window_data_layer.s: layers/window_data_layer.cpp.s +.PHONY : layers/window_data_layer.s + +# target to generate assembly for a file +layers/window_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s +.PHONY : layers/window_data_layer.cpp.s + +net.o: net.cpp.o +.PHONY : net.o + +# target to build an object file +net.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o +.PHONY : net.cpp.o + +net.i: net.cpp.i +.PHONY : net.i + +# target to preprocess a source file +net.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.i +.PHONY : net.cpp.i + +net.s: net.cpp.s +.PHONY : net.s + +# target to generate assembly for a file +net.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.s +.PHONY : net.cpp.s + +solver.o: solver.cpp.o +.PHONY : solver.o + +# target to build an object file +solver.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o +.PHONY : solver.cpp.o + +solver.i: solver.cpp.i +.PHONY : solver.i + +# target to preprocess a source file +solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.i +.PHONY : solver.cpp.i + +solver.s: solver.cpp.s +.PHONY : solver.s + +# target to generate assembly for a file +solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.s +.PHONY : solver.cpp.s + +syncedmem.o: syncedmem.cpp.o +.PHONY : syncedmem.o + +# target to build an object file +syncedmem.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o +.PHONY : 
syncedmem.cpp.o + +syncedmem.i: syncedmem.cpp.i +.PHONY : syncedmem.i + +# target to preprocess a source file +syncedmem.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i +.PHONY : syncedmem.cpp.i + +syncedmem.s: syncedmem.cpp.s +.PHONY : syncedmem.s + +# target to generate assembly for a file +syncedmem.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s +.PHONY : syncedmem.cpp.s + +util/benchmark.o: util/benchmark.cpp.o +.PHONY : util/benchmark.o + +# target to build an object file +util/benchmark.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o +.PHONY : util/benchmark.cpp.o + +util/benchmark.i: util/benchmark.cpp.i +.PHONY : util/benchmark.i + +# target to preprocess a source file +util/benchmark.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i +.PHONY : util/benchmark.cpp.i + +util/benchmark.s: util/benchmark.cpp.s +.PHONY : util/benchmark.s + +# target to generate assembly for a file +util/benchmark.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s +.PHONY : util/benchmark.cpp.s + +util/cudnn.o: util/cudnn.cpp.o +.PHONY : util/cudnn.o + +# target to build an object file +util/cudnn.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o +.PHONY : util/cudnn.cpp.o + +util/cudnn.i: util/cudnn.cpp.i +.PHONY : util/cudnn.i + +# target to preprocess a source file +util/cudnn.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i +.PHONY : util/cudnn.cpp.i + +util/cudnn.s: util/cudnn.cpp.s +.PHONY : util/cudnn.s + +# target to generate assembly for a file +util/cudnn.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s +.PHONY : util/cudnn.cpp.s + +util/db.o: util/db.cpp.o +.PHONY : util/db.o + +# target to build an object file +util/db.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o +.PHONY : util/db.cpp.o + +util/db.i: util/db.cpp.i +.PHONY : util/db.i + +# target to preprocess a source file +util/db.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i +.PHONY : util/db.cpp.i + +util/db.s: util/db.cpp.s +.PHONY : util/db.s + +# target to generate assembly for a file +util/db.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s +.PHONY : util/db.cpp.s + +util/db_leveldb.o: util/db_leveldb.cpp.o +.PHONY : util/db_leveldb.o + +# target to build an object file +util/db_leveldb.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o +.PHONY : util/db_leveldb.cpp.o + +util/db_leveldb.i: util/db_leveldb.cpp.i +.PHONY : util/db_leveldb.i + +# target to preprocess a source file +util/db_leveldb.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i +.PHONY : util/db_leveldb.cpp.i + +util/db_leveldb.s: 
util/db_leveldb.cpp.s +.PHONY : util/db_leveldb.s + +# target to generate assembly for a file +util/db_leveldb.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s +.PHONY : util/db_leveldb.cpp.s + +util/db_lmdb.o: util/db_lmdb.cpp.o +.PHONY : util/db_lmdb.o + +# target to build an object file +util/db_lmdb.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o +.PHONY : util/db_lmdb.cpp.o + +util/db_lmdb.i: util/db_lmdb.cpp.i +.PHONY : util/db_lmdb.i + +# target to preprocess a source file +util/db_lmdb.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i +.PHONY : util/db_lmdb.cpp.i + +util/db_lmdb.s: util/db_lmdb.cpp.s +.PHONY : util/db_lmdb.s + +# target to generate assembly for a file +util/db_lmdb.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s +.PHONY : util/db_lmdb.cpp.s + +util/im2col.o: util/im2col.cpp.o +.PHONY : util/im2col.o + +# target to build an object file +util/im2col.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o +.PHONY : util/im2col.cpp.o + +util/im2col.i: util/im2col.cpp.i +.PHONY : util/im2col.i + +# target to preprocess a source file +util/im2col.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i +.PHONY : util/im2col.cpp.i + +util/im2col.s: util/im2col.cpp.s +.PHONY : util/im2col.s + +# target to generate assembly for a file +util/im2col.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s +.PHONY : util/im2col.cpp.s + +util/insert_splits.o: util/insert_splits.cpp.o +.PHONY : util/insert_splits.o + +# target to build an object file +util/insert_splits.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o +.PHONY : util/insert_splits.cpp.o + +util/insert_splits.i: util/insert_splits.cpp.i +.PHONY : util/insert_splits.i + +# target to preprocess a source file +util/insert_splits.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i +.PHONY : util/insert_splits.cpp.i + +util/insert_splits.s: util/insert_splits.cpp.s +.PHONY : util/insert_splits.s + +# target to generate assembly for a file +util/insert_splits.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s +.PHONY : util/insert_splits.cpp.s + +util/io.o: util/io.cpp.o +.PHONY : util/io.o + +# target to build an object file +util/io.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o +.PHONY : util/io.cpp.o + +util/io.i: util/io.cpp.i +.PHONY : util/io.i + +# target to preprocess a source file +util/io.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i +.PHONY : util/io.cpp.i + +util/io.s: util/io.cpp.s +.PHONY : util/io.s + +# target to generate assembly for a file +util/io.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s +.PHONY : util/io.cpp.s + +util/math_functions.o: util/math_functions.cpp.o +.PHONY : util/math_functions.o + +# target to build an object file +util/math_functions.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o +.PHONY : util/math_functions.cpp.o + +util/math_functions.i: util/math_functions.cpp.i +.PHONY : util/math_functions.i + +# target to preprocess a source file +util/math_functions.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i +.PHONY : util/math_functions.cpp.i + +util/math_functions.s: util/math_functions.cpp.s +.PHONY : util/math_functions.s + +# target to generate assembly for a file +util/math_functions.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s +.PHONY : util/math_functions.cpp.s + +util/ocl_util.o: util/ocl_util.cpp.o +.PHONY : util/ocl_util.o + +# target to build an object file +util/ocl_util.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o +.PHONY : util/ocl_util.cpp.o + +util/ocl_util.i: util/ocl_util.cpp.i +.PHONY : util/ocl_util.i + +# target to preprocess a source file +util/ocl_util.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i +.PHONY : util/ocl_util.cpp.i + +util/ocl_util.s: util/ocl_util.cpp.s +.PHONY : util/ocl_util.s + +# target to generate assembly for a file +util/ocl_util.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s +.PHONY : util/ocl_util.cpp.s + +util/ocl_wrapper.o: util/ocl_wrapper.cpp.o +.PHONY : util/ocl_wrapper.o + +# target to build an object file +util/ocl_wrapper.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o +.PHONY : util/ocl_wrapper.cpp.o + +util/ocl_wrapper.i: util/ocl_wrapper.cpp.i +.PHONY : util/ocl_wrapper.i + +# target to preprocess a source file +util/ocl_wrapper.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i +.PHONY : util/ocl_wrapper.cpp.i + +util/ocl_wrapper.s: util/ocl_wrapper.cpp.s +.PHONY : util/ocl_wrapper.s + +# target to generate assembly for a file +util/ocl_wrapper.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s +.PHONY : util/ocl_wrapper.cpp.s + +util/upgrade_proto.o: util/upgrade_proto.cpp.o +.PHONY : util/upgrade_proto.o + +# target to build an object file +util/upgrade_proto.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o +.PHONY : util/upgrade_proto.cpp.o + +util/upgrade_proto.i: util/upgrade_proto.cpp.i +.PHONY : util/upgrade_proto.i + +# target to preprocess a source file +util/upgrade_proto.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i +.PHONY : util/upgrade_proto.cpp.i + +util/upgrade_proto.s: util/upgrade_proto.cpp.s +.PHONY : util/upgrade_proto.s + +# target to generate assembly for a file +util/upgrade_proto.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s +.PHONY : util/upgrade_proto.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... caffe" + @echo "... edit_cache" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... proto" + @echo "... rebuild_cache" + @echo "... __/__/include/caffe/proto/caffe.pb.o" + @echo "... __/__/include/caffe/proto/caffe.pb.i" + @echo "... __/__/include/caffe/proto/caffe.pb.s" + @echo "... blob.o" + @echo "... blob.i" + @echo "... blob.s" + @echo "... common.o" + @echo "... common.i" + @echo "... common.s" + @echo "... data_transformer.o" + @echo "... data_transformer.i" + @echo "... data_transformer.s" + @echo "... device.o" + @echo "... device.i" + @echo "... device.s" + @echo "... internal_thread.o" + @echo "... internal_thread.i" + @echo "... internal_thread.s" + @echo "... layer_factory.o" + @echo "... layer_factory.i" + @echo "... layer_factory.s" + @echo "... layers/absval_layer.o" + @echo "... layers/absval_layer.i" + @echo "... layers/absval_layer.s" + @echo "... layers/accuracy_layer.o" + @echo "... layers/accuracy_layer.i" + @echo "... layers/accuracy_layer.s" + @echo "... layers/argmax_layer.o" + @echo "... layers/argmax_layer.i" + @echo "... layers/argmax_layer.s" + @echo "... layers/base_conv_layer.o" + @echo "... layers/base_conv_layer.i" + @echo "... layers/base_conv_layer.s" + @echo "... layers/base_data_layer.o" + @echo "... layers/base_data_layer.i" + @echo "... layers/base_data_layer.s" + @echo "... layers/bnll_layer.o" + @echo "... layers/bnll_layer.i" + @echo "... layers/bnll_layer.s" + @echo "... layers/concat_layer.o" + @echo "... layers/concat_layer.i" + @echo "... layers/concat_layer.s" + @echo "... 
layers/contrastive_loss_layer.o" + @echo "... layers/contrastive_loss_layer.i" + @echo "... layers/contrastive_loss_layer.s" + @echo "... layers/conv_layer.o" + @echo "... layers/conv_layer.i" + @echo "... layers/conv_layer.s" + @echo "... layers/cudnn_conv_layer.o" + @echo "... layers/cudnn_conv_layer.i" + @echo "... layers/cudnn_conv_layer.s" + @echo "... layers/cudnn_pooling_layer.o" + @echo "... layers/cudnn_pooling_layer.i" + @echo "... layers/cudnn_pooling_layer.s" + @echo "... layers/cudnn_relu_layer.o" + @echo "... layers/cudnn_relu_layer.i" + @echo "... layers/cudnn_relu_layer.s" + @echo "... layers/cudnn_sigmoid_layer.o" + @echo "... layers/cudnn_sigmoid_layer.i" + @echo "... layers/cudnn_sigmoid_layer.s" + @echo "... layers/cudnn_softmax_layer.o" + @echo "... layers/cudnn_softmax_layer.i" + @echo "... layers/cudnn_softmax_layer.s" + @echo "... layers/cudnn_tanh_layer.o" + @echo "... layers/cudnn_tanh_layer.i" + @echo "... layers/cudnn_tanh_layer.s" + @echo "... layers/data_layer.o" + @echo "... layers/data_layer.i" + @echo "... layers/data_layer.s" + @echo "... layers/deconv_layer.o" + @echo "... layers/deconv_layer.i" + @echo "... layers/deconv_layer.s" + @echo "... layers/dropout_layer.o" + @echo "... layers/dropout_layer.i" + @echo "... layers/dropout_layer.s" + @echo "... layers/dummy_data_layer.o" + @echo "... layers/dummy_data_layer.i" + @echo "... layers/dummy_data_layer.s" + @echo "... layers/eltwise_layer.o" + @echo "... layers/eltwise_layer.i" + @echo "... layers/eltwise_layer.s" + @echo "... layers/euclidean_loss_layer.o" + @echo "... layers/euclidean_loss_layer.i" + @echo "... layers/euclidean_loss_layer.s" + @echo "... layers/exp_layer.o" + @echo "... layers/exp_layer.i" + @echo "... layers/exp_layer.s" + @echo "... layers/filter_layer.o" + @echo "... layers/filter_layer.i" + @echo "... layers/filter_layer.s" + @echo "... layers/flatten_layer.o" + @echo "... layers/flatten_layer.i" + @echo "... layers/flatten_layer.s" + @echo "... 
layers/hdf5_data_layer.o" + @echo "... layers/hdf5_data_layer.i" + @echo "... layers/hdf5_data_layer.s" + @echo "... layers/hdf5_output_layer.o" + @echo "... layers/hdf5_output_layer.i" + @echo "... layers/hdf5_output_layer.s" + @echo "... layers/hinge_loss_layer.o" + @echo "... layers/hinge_loss_layer.i" + @echo "... layers/hinge_loss_layer.s" + @echo "... layers/im2col_layer.o" + @echo "... layers/im2col_layer.i" + @echo "... layers/im2col_layer.s" + @echo "... layers/image_data_layer.o" + @echo "... layers/image_data_layer.i" + @echo "... layers/image_data_layer.s" + @echo "... layers/infogain_loss_layer.o" + @echo "... layers/infogain_loss_layer.i" + @echo "... layers/infogain_loss_layer.s" + @echo "... layers/inner_product_layer.o" + @echo "... layers/inner_product_layer.i" + @echo "... layers/inner_product_layer.s" + @echo "... layers/log_layer.o" + @echo "... layers/log_layer.i" + @echo "... layers/log_layer.s" + @echo "... layers/loss_layer.o" + @echo "... layers/loss_layer.i" + @echo "... layers/loss_layer.s" + @echo "... layers/lrn_layer.o" + @echo "... layers/lrn_layer.i" + @echo "... layers/lrn_layer.s" + @echo "... layers/memory_data_layer.o" + @echo "... layers/memory_data_layer.i" + @echo "... layers/memory_data_layer.s" + @echo "... layers/multinomial_logistic_loss_layer.o" + @echo "... layers/multinomial_logistic_loss_layer.i" + @echo "... layers/multinomial_logistic_loss_layer.s" + @echo "... layers/mvn_layer.o" + @echo "... layers/mvn_layer.i" + @echo "... layers/mvn_layer.s" + @echo "... layers/neuron_layer.o" + @echo "... layers/neuron_layer.i" + @echo "... layers/neuron_layer.s" + @echo "... layers/pooling_layer.o" + @echo "... layers/pooling_layer.i" + @echo "... layers/pooling_layer.s" + @echo "... layers/power_layer.o" + @echo "... layers/power_layer.i" + @echo "... layers/power_layer.s" + @echo "... layers/prelu_layer.o" + @echo "... layers/prelu_layer.i" + @echo "... layers/prelu_layer.s" + @echo "... 
layers/reduction_layer.o" + @echo "... layers/reduction_layer.i" + @echo "... layers/reduction_layer.s" + @echo "... layers/relu_layer.o" + @echo "... layers/relu_layer.i" + @echo "... layers/relu_layer.s" + @echo "... layers/reshape_layer.o" + @echo "... layers/reshape_layer.i" + @echo "... layers/reshape_layer.s" + @echo "... layers/sigmoid_cross_entropy_loss_layer.o" + @echo "... layers/sigmoid_cross_entropy_loss_layer.i" + @echo "... layers/sigmoid_cross_entropy_loss_layer.s" + @echo "... layers/sigmoid_layer.o" + @echo "... layers/sigmoid_layer.i" + @echo "... layers/sigmoid_layer.s" + @echo "... layers/silence_layer.o" + @echo "... layers/silence_layer.i" + @echo "... layers/silence_layer.s" + @echo "... layers/slice_layer.o" + @echo "... layers/slice_layer.i" + @echo "... layers/slice_layer.s" + @echo "... layers/softmax_layer.o" + @echo "... layers/softmax_layer.i" + @echo "... layers/softmax_layer.s" + @echo "... layers/softmax_loss_layer.o" + @echo "... layers/softmax_loss_layer.i" + @echo "... layers/softmax_loss_layer.s" + @echo "... layers/split_layer.o" + @echo "... layers/split_layer.i" + @echo "... layers/split_layer.s" + @echo "... layers/spp_layer.o" + @echo "... layers/spp_layer.i" + @echo "... layers/spp_layer.s" + @echo "... layers/tanh_layer.o" + @echo "... layers/tanh_layer.i" + @echo "... layers/tanh_layer.s" + @echo "... layers/threshold_layer.o" + @echo "... layers/threshold_layer.i" + @echo "... layers/threshold_layer.s" + @echo "... layers/window_data_layer.o" + @echo "... layers/window_data_layer.i" + @echo "... layers/window_data_layer.s" + @echo "... net.o" + @echo "... net.i" + @echo "... net.s" + @echo "... solver.o" + @echo "... solver.i" + @echo "... solver.s" + @echo "... syncedmem.o" + @echo "... syncedmem.i" + @echo "... syncedmem.s" + @echo "... util/benchmark.o" + @echo "... util/benchmark.i" + @echo "... util/benchmark.s" + @echo "... util/cudnn.o" + @echo "... util/cudnn.i" + @echo "... util/cudnn.s" + @echo "... 
util/db.o" + @echo "... util/db.i" + @echo "... util/db.s" + @echo "... util/db_leveldb.o" + @echo "... util/db_leveldb.i" + @echo "... util/db_leveldb.s" + @echo "... util/db_lmdb.o" + @echo "... util/db_lmdb.i" + @echo "... util/db_lmdb.s" + @echo "... util/im2col.o" + @echo "... util/im2col.i" + @echo "... util/im2col.s" + @echo "... util/insert_splits.o" + @echo "... util/insert_splits.i" + @echo "... util/insert_splits.s" + @echo "... util/io.o" + @echo "... util/io.i" + @echo "... util/io.s" + @echo "... util/math_functions.o" + @echo "... util/math_functions.i" + @echo "... util/math_functions.s" + @echo "... util/ocl_util.o" + @echo "... util/ocl_util.i" + @echo "... util/ocl_util.s" + @echo "... util/ocl_wrapper.o" + @echo "... util/ocl_wrapper.i" + @echo "... util/ocl_wrapper.s" + @echo "... util/upgrade_proto.o" + @echo "... util/upgrade_proto.i" + @echo "... util/upgrade_proto.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/caffe/cmake_install.cmake b/src/caffe/cmake_install.cmake new file mode 100644 index 00000000..f98ef538 --- /dev/null +++ b/src/caffe/cmake_install.cmake @@ -0,0 +1,79 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? 
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include" TYPE DIRECTORY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/caffe/proto" TYPE FILE FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHECK + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + ENDIF() + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libcaffe.so") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHANGE + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + OLD_RPATH "/usr/local/cuda/lib64:/usr/local/lib:::::::::::::::::::::::::::::::::::::::::::::::::::::::::" + NEW_RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + IF(CMAKE_INSTALL_DO_STRIP) + EXECUTE_PROCESS(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + ENDIF(CMAKE_INSTALL_DO_STRIP) + ENDIF() +ENDIF(NOT 
CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE STATIC_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libproto.a") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/python/caffe/proto" TYPE PROGRAM FILES + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/__init__.py" + ) +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_LOCAL_ONLY) + # Include the install script for each subdirectory. + INCLUDE("/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/cmake_install.cmake") + +ENDIF(NOT CMAKE_INSTALL_LOCAL_ONLY) + diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index c4fe1195..5d56493b 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -197,6 +197,7 @@ void Caffe::DeviceQuery() { << (prop.kernelExecTimeoutEnabled ? 
"Yes" : "No"); return; */ + amdDevice.DeviceQuery(); } diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 3ce6cefe..7a866c11 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -44,7 +44,8 @@ std::string oclKernelPath="./src/caffe/ocl/"; Device::~Device(){ //clAmdBlasTeardown(); - free((void*)platformIDs); + ReleaseKernels(); + free((void*)platformIDs); free(DeviceIDs); clReleaseProgram(Program); clReleaseCommandQueue(CommandQueue); @@ -74,7 +75,7 @@ cl_int Device::Init(){ GetDeviceInfo(); cl_uint uiNumDevices; cl_bool unified_memory = false; - switch(Caffe::mode()) { +/* switch(Caffe::mode()) { case Caffe::GPU: //choose_gpu(); clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); @@ -107,7 +108,8 @@ cl_int Device::Init(){ OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) ); LOG(INFO) << "picked device type: CPU"; break; - case Caffe::APU: +*/ +// case Caffe::APU: clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); uiNumDevices = numDevices; if(0 == uiNumDevices){ @@ -126,10 +128,10 @@ cl_int Device::Init(){ } } LOG(INFO) << "picked device type: APU"; - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + // break; + // default: + // LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + // } //Create Context Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); @@ -315,6 +317,15 @@ cl_kernel Device::GetKernel(std::string kernel_name) return Kernels[kernel_name]; } +void Device::ReleaseKernels() +{ + std::map::iterator it; + for(it = Kernels.begin(); it != Kernels.end(); it++) + { + clReleaseKernel(it->second); + } +} + void Device::DisplayPlatformInfo(){ cl_int err; size_t size; @@ -413,6 +424,26 @@ void Device::GetDeviceInfo(){ } +void Device::DeviceQuery() +{ + //Get Platform Infomation + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + 
clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); + if(res != CL_SUCCESS){ + fprintf(stderr, "Err: Failed to Get Platform Info (%d)\n", res); // include the cl_int status code in the message + return; + } + platformName[nameLen] = 0; // NOTE(review): nameLen can reach the 64-byte limit passed above; confirm platformName has room for index 64 + + GetDeviceInfo(); +} + template void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){ cl_int err; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 855c00e1..8f7d8f82 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -33,7 +33,7 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } } - CHECK_BLOB_DATA(top[0],20, "top[0]"); +// CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -67,9 +67,9 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } - CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff"); + //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); + //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); + //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff"); } @@ -80,7 +80,7 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, Forward_gpu_opt(bottom, top); else Forward_gpu_org(bottom, top); - CHECK_BLOB_DATA(top[0],20, "top[0]"); +// CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -160,7 +160,7 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - CHECK_BLOB_DATA(top[0],20, "top[0]"); + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -256,10 +256,10 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } } - CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, 
"weight_diff"); - CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); +// CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); +// CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); +// CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); + // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index a3cca01c..22456302 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -152,6 +152,7 @@ void SoftmaxWithLossLayer::Forward_gpu( outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); Dtype loss; caffe_gpu_asum(nthreads, loss_data, &loss); + printf("loss = %f\n", loss); if (normalize_) { Dtype count; caffe_gpu_asum(nthreads, counts, &count); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index ad6bdc7e..f5d0e703 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -511,21 +511,17 @@ Dtype Net::ForwardFromTo(int start, int end) { forward_timer.Start(); for (int i = start; i <= end; ++i) { - //double begin_time = GettickCount(); layer_timer.Start(); - //printf("Forwarding %s\n",layer_names_[i].c_str()); Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } clFinish(amdDevice.CommandQueue); - //double end_time = GettickCount(); layer_timer.Stop(); - //printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), end_time-begin_time); printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); } forward_timer.Stop(); - printf("Forward time: %f\n\n", forward_timer.MilliSeconds()); + printf("Total Forward time: %f\n\n", 
forward_timer.MilliSeconds()); return loss; } @@ -587,22 +583,23 @@ void Net::BackwardFromTo(int start, int end) { CHECK_LT(start, layers_.size()); CPUTimer backward_timer; + CPUTimer layer_timer; backward_timer.Start(); for (int i = start; i >= end; --i) { + layer_timer.Start(); if (layer_need_backward_[i]) { -//Yibing add for porting - printf("Backwarding %s\n",layer_names_[i].c_str()); layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } -//Yibing add for porting clFinish(amdDevice.CommandQueue); + layer_timer.Stop(); // stop (not restart) the per-layer timer before reading it, as ForwardFromTo does + printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); } } backward_timer.Stop(); - printf("Backward time: %f\n\n", backward_timer.MilliSeconds()); + printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); } template diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index d94efcba..b6a5a0a1 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -90,8 +90,8 @@ __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const } } -template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); -template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); +template 
__attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); +template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); template __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){ diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index 55026603..9710a343 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -34,6 +34,7 @@ __kernel void OCL_memset(__global T* buffer, const T value, const int size){ } } +template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size); template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 63c8294c..f4b57a41 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -53,7 +53,7 @@ void Solver::Init(const SolverParameter& param) { 
//#ifndef CPU_ONLY //AMD device related initialization - amdDevice.Init(); + //amdDevice.Init(); ocl_setup(); // cl_int err = clblasSetup(); //#else @@ -236,7 +236,9 @@ void Solver::Step(int iters) { int idx = (iter_ - start_iter) % average_loss; smoothed_loss += (loss - losses[idx]) / average_loss; losses[idx] = loss; + printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx); } + printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %d \n", smoothed_loss,average_loss, losses.size()); if (display) { LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = net_->output_blobs(); diff --git a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake new file mode 100644 index 00000000..7bb0014c --- /dev/null +++ b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake @@ -0,0 +1,16 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Relative path conversion top directories. +SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe") +SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe") + +# Force unix paths in dependencies. +SET(CMAKE_FORCE_UNIX_PATHS 1) + + +# The C and CXX include file regular expressions for this directory. 
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$") +SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$") +SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN}) +SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN}) diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake new file mode 100644 index 00000000..895d9fca --- /dev/null +++ b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake @@ -0,0 +1,296 @@ +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +########################################################################## +# This file runs the nvcc commands to produce the desired output file along with +# the dependency file needed by CMake to compute dependencies. In addition the +# file checks the output of each command and if the command fails it deletes the +# output files. + +# Input variables +# +# verbose:BOOL=<> OFF: Be as quiet as possible (default) +# ON : Describe each step +# +# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or +# RelWithDebInfo, but it should match one of the +# entries in CUDA_HOST_FLAGS. This is the build +# configuration used when compiling the code. If +# blank or unspecified Debug is assumed as this is +# what CMake does. +# +# generated_file:STRING=<> File to generate. This argument must be passed in. +# +# generated_cubin_file:STRING=<> File to generate. This argument must be passed +# in if build_cubin is true. 
+ +if(NOT generated_file) + message(FATAL_ERROR "You must specify generated_file on the command line") +endif() + +# Set these up as variables to make reading the generated file easier +set(CMAKE_COMMAND "/usr/bin/cmake") # path +set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_kernel.cu") # path +set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.NVCC-depend") # path +set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.depend") # path +set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path +set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path +set(build_cubin OFF) # bool +set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool +# We won't actually use these variables for now, but we need to set this, in +# order to force this file to be run again if it changes. 
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.") # path +set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o") # path +set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt") # path + +set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path +set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC;-Xcompiler;-fPIC ;; ) # list +# Build specific configuration flags +set(CUDA_NVCC_FLAGS_DEBUG ; ) +set(CUDA_NVCC_FLAGS_RELEASE ; ) +set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) +set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list +set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). +set(format_flag "-c") # string + +if(build_cubin AND NOT generated_cubin_file) + message(FATAL_ERROR "You must specify generated_cubin_file on the command line") +endif() + +# This is the list of host compilation flags. It C or CXX should already have +# been chosen by FindCUDA.cmake. 
+set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) +set(CMAKE_HOST_FLAGS_DEBUG -g) +set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) +set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) +set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) + +# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler +set(nvcc_host_compiler_flags "") +# If we weren't given a build_configuration, use Debug. +if(NOT build_configuration) + set(build_configuration Debug) +endif() +string(TOUPPER "${build_configuration}" build_configuration) +#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") +foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) + # Extra quotes are added around each flag to help nvcc parse out flags with spaces. + set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") +endforeach() +if (nvcc_host_compiler_flags) + set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) +endif() +#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") +# Add the build specific configuration flags +list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) + +# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority +list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) +list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) +if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) + if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) + set(CCBIN -ccbin "${CCBIN}") + else() + set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") + endif() +endif() + +# cuda_execute_process - Executes a command with optional command echo and status message. 
+# +# status - Status message to print if verbose is true +# command - COMMAND argument from the usual execute_process argument structure +# ARGN - Remaining arguments are the command with arguments +# +# CUDA_result - return value from running the command +# +# Make this a macro instead of a function, so that things like RESULT_VARIABLE +# and other return variables are present after executing the process. +macro(cuda_execute_process status command) + set(_command ${command}) + if(NOT _command STREQUAL "COMMAND") + message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") + endif() + if(verbose) + execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) + # Now we need to build up our command string. We are accounting for quotes + # and spaces, anything else is left up to the user to fix if they want to + # copy and paste a runnable command line. + set(cuda_execute_process_string) + foreach(arg ${ARGN}) + # If there are quotes, excape them, so they come through. + string(REPLACE "\"" "\\\"" arg ${arg}) + # Args with spaces need quotes around them to get them to be parsed as a single argument. + if(arg MATCHES " ") + list(APPEND cuda_execute_process_string "\"${arg}\"") + else() + list(APPEND cuda_execute_process_string ${arg}) + endif() + endforeach() + # Echo the command + execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) + endif() + # Run the command + execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) +endmacro() + +# Delete the target file +cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + +# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag +# for dependency generation and hope for the best. 
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") +set(CUDA_VERSION 6.5) +if(CUDA_VERSION VERSION_LESS "3.0") + cmake_policy(PUSH) + # CMake policy 0007 NEW states that empty list elements are not + # ignored. I'm just setting it to avoid the warning that's printed. + cmake_policy(SET CMP0007 NEW) + # Note that this will remove all occurances of -G. + list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") + cmake_policy(POP) +endif() + +# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This +# can cause incorrect dependencies when #including files based on this macro which is +# defined in the generating passes of nvcc invokation. We will go ahead and manually +# define this for now until a future version fixes this bug. +set(CUDACC_DEFINE -D__CUDACC__) + +# Generate the dependency file +cuda_execute_process( + "Generating dependency file: ${NVCC_generated_dependency_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + -M + ${CUDACC_DEFINE} + "${source_file}" + -o "${NVCC_generated_dependency_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${depends_CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the cmake readable dependency file to a temp file. Don't put the +# quotes just around the filenames for the input_file and output_file variables. +# CMake will pass the quotes through and not be able to find the file. 
+cuda_execute_process( + "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" + -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" + -P "${CUDA_make2cmake}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Copy the file if it is different +cuda_execute_process( + "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Delete the temporary file +cuda_execute_process( + "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" + ) + +if(CUDA_result) + message(FATAL_ERROR "Error generating ${generated_file}") +endif() + +# Generate the code +cuda_execute_process( + "Generating ${generated_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${format_flag} -o "${generated_file}" + ${CCBIN} + ${nvcc_flags} + ${nvcc_host_compiler_flags} + ${CUDA_NVCC_FLAGS} + -DNVCC + ${CUDA_NVCC_INCLUDE_ARGS} + ) + +if(CUDA_result) + # Since nvcc can sometimes leave half done files make sure that we delete the output file. + cuda_execute_process( + "Removing ${generated_file}" + COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" + ) + message(FATAL_ERROR "Error generating file ${generated_file}") +else() + if(verbose) + message("Generated ${generated_file} successfully.") + endif() +endif() + +# Cubin resource report commands. +if( build_cubin ) + # Run with -cubin to produce resource usage report. 
+ cuda_execute_process( + "Generating ${generated_cubin_file}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" + "${source_file}" + ${CUDA_NVCC_FLAGS} + ${nvcc_flags} + ${CCBIN} + ${nvcc_host_compiler_flags} + -DNVCC + -cubin + -o "${generated_cubin_file}" + ${CUDA_NVCC_INCLUDE_ARGS} + ) + + # Execute the parser script. + cuda_execute_process( + "Executing the parser script" + COMMAND "${CMAKE_COMMAND}" + -D "input_file:STRING=${generated_cubin_file}" + -P "${CUDA_parse_cubin}" + ) + +endif() diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend new file mode 100644 index 00000000..8e3a0be1 --- /dev/null +++ b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend @@ -0,0 +1 @@ +#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/test/CMakeFiles/progress.marks b/src/caffe/test/CMakeFiles/progress.marks new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/src/caffe/test/CMakeFiles/progress.marks @@ -0,0 +1 @@ +0 diff --git a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake new file mode 100644 index 00000000..f660fadf --- /dev/null +++ b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake @@ -0,0 +1,27 @@ +# The set of languages for which implicit dependencies are needed: +SET(CMAKE_DEPENDS_LANGUAGES + ) +# The set of files for implicit dependencies of each language: + +# Preprocessor definitions for this target. +SET(CMAKE_TARGET_DEFINITIONS + "GTEST_USE_OWN_TR1_TUPLE" + ) + +# Targets to which this target links. +SET(CMAKE_TARGET_LINKED_INFO_FILES + ) + +# The include file search paths: +SET(CMAKE_C_TARGET_INCLUDE_PATH + "src" + "/usr/local/include" + "include" + "/usr/local/cuda/include" + "/usr/local/include/opencv" + "/usr/include/atlas" + "." 
+ ) +SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/test/CMakeFiles/runtest.dir/build.make b/src/caffe/test/CMakeFiles/runtest.dir/build.make new file mode 100644 index 00000000..7ccc5279 --- /dev/null +++ b/src/caffe/test/CMakeFiles/runtest.dir/build.make @@ -0,0 +1,69 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# Utility rule file for runtest. + +# Include the progress variables for this target. 
+include src/caffe/test/CMakeFiles/runtest.dir/progress.make + +src/caffe/test/CMakeFiles/runtest: + /home/yugao/caffe-merge-junli/caffe-yb/caffe/test/test.testbin --gtest_shuffle + +runtest: src/caffe/test/CMakeFiles/runtest +runtest: src/caffe/test/CMakeFiles/runtest.dir/build.make +.PHONY : runtest + +# Rule to build all files generated by this target. +src/caffe/test/CMakeFiles/runtest.dir/build: runtest +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/build + +src/caffe/test/CMakeFiles/runtest.dir/clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/runtest.dir/cmake_clean.cmake +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/clean + +src/caffe/test/CMakeFiles/runtest.dir/depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake --color=$(COLOR) +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/depend + diff --git a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake new file mode 100644 index 00000000..ed560e60 --- /dev/null +++ b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake @@ -0,0 +1,8 @@ +FILE(REMOVE_RECURSE + "CMakeFiles/runtest" +) + +# Per-language clean rules from dependency scanning. 
+FOREACH(lang) + INCLUDE(CMakeFiles/runtest.dir/cmake_clean_${lang}.cmake OPTIONAL) +ENDFOREACH(lang) diff --git a/src/caffe/test/CMakeFiles/runtest.dir/progress.make b/src/caffe/test/CMakeFiles/runtest.dir/progress.make new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/caffe/test/CMakeFiles/runtest.dir/progress.make @@ -0,0 +1 @@ + diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake new file mode 100644 index 00000000..d4748b21 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake @@ -0,0 +1,92 @@ +# The set of languages for which implicit dependencies are needed: +SET(CMAKE_DEPENDS_LANGUAGES + "CXX" + ) +# The set of files for implicit dependencies of each language: +SET(CMAKE_DEPENDS_CHECK_CXX + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" + 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" + 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" + 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" + ) +SET(CMAKE_CXX_COMPILER_ID "GNU") + +# Preprocessor definitions for this target. +SET(CMAKE_TARGET_DEFINITIONS + "GTEST_USE_OWN_TR1_TUPLE" + ) + +# Targets to which this target links. +SET(CMAKE_TARGET_LINKED_INFO_FILES + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake" + ) + +# The include file search paths: +SET(CMAKE_C_TARGET_INCLUDE_PATH + "src" + "/usr/local/include" + "include" + "/usr/local/cuda/include" + "/usr/local/include/opencv" + "/usr/include/atlas" + "." 
+ ) +SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make new file mode 100644 index 00000000..c67def36 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make @@ -0,0 +1,1623 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# Include any dependencies generated for this target. +include src/caffe/test/CMakeFiles/test.testbin.dir/depend.make + +# Include the progress variables for this target. 
+include src/caffe/test/CMakeFiles/test.testbin.dir/progress.make + +# Include the compile flags for this target's objects. +include src/caffe/test/CMakeFiles/test.testbin.dir/flags.make + +src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend +src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake +src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/test_im2col_kernel.cu + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//. 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.cmake + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/test_spp_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp > CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/test_filler.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filler.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filler.cpp.i" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp > CMakeFiles/test.testbin.dir/test_filler.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filler.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp -o CMakeFiles/test.testbin.dir/test_filler.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/test_im2col_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp > CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: 
src/caffe/test/test_common.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_common.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp > CMakeFiles/test.testbin.dir/test_common.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_common.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp -o CMakeFiles/test.testbin.dir/test_common.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/test_infogain_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/test_math_functions.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_math_functions.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp > 
CMakeFiles/test.testbin.dir/test_math_functions.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_math_functions.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/test_euclidean_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: 
src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/test_split_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_split_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp > CMakeFiles/test.testbin.dir/test_split_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_split_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires + $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/test_reshape_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp > CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/test_random_number_generator.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp > CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/test_lrn_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp > CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/test_gradient_based_solver.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp > CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires: +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/test_upgrade_proto.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp > CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s: 
cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/test_io.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to 
CMakeFiles/test.testbin.dir/test_io.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp > CMakeFiles/test.testbin.dir/test_io.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_io.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp -o CMakeFiles/test.testbin.dir/test_io.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/test_accuracy_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp > CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/test_caffe_main.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp > CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/test_net.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_net.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp > CMakeFiles/test.testbin.dir/test_net.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_net.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp -o CMakeFiles/test.testbin.dir/test_net.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires: +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/test_filter_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp > CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/test_power_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_power_layer.cpp.i" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp > CMakeFiles/test.testbin.dir/test_power_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_power_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/test_softmax_with_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/test_argmax_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp > CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires: +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/test_solver.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_solver.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp > CMakeFiles/test.testbin.dir/test_solver.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/test.testbin.dir/test_solver.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp -o CMakeFiles/test.testbin.dir/test_solver.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/test_blob.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_blob.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp > CMakeFiles/test.testbin.dir/test_blob.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_blob.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp -o CMakeFiles/test.testbin.dir/test_blob.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/test_benchmark.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_benchmark.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp > CMakeFiles/test.testbin.dir/test_benchmark.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_benchmark.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/test_multinomial_logistic_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/test_util_blas.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_util_blas.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp > CMakeFiles/test.testbin.dir/test_util_blas.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_util_blas.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: 
src/caffe/test/test_internal_thread.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp > CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/test_reduction_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp > CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/test_contrastive_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i" + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/test_eltwise_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp > CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/test_maxpool_dropout_layers.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp > CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/test_threshold_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp > 
CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/test_pooling_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp > CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: 
src/caffe/test/test_softmax_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/test_inner_product_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp > CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/test_flatten_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp > CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/test_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp > CMakeFiles/test.testbin.dir/test_data_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/test_syncedmem.cpp + 
$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp > CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/test_hdf5data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires: +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/test_deconvolution_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp > CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/test_neuron_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp > CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/test_concat_layer.cpp + 
$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp > CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/test_protobuf.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_protobuf.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp > CMakeFiles/test.testbin.dir/test_protobuf.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_protobuf.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires: +.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/test_hdf5_output_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E 
cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/test_memory_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp > CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/test_tanh_layer.cpp + $(CMAKE_COMMAND) -E 
cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp > CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/test_stochastic_pooling.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp > CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/test_dummy_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp > 
CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/test_layer_factory.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp > CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/test_db.cpp + $(CMAKE_COMMAND) -E 
cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_db.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp > CMakeFiles/test.testbin.dir/test_db.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_db.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp -o CMakeFiles/test.testbin.dir/test_db.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o + 
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/test_mvn_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp > CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires + $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/test_convolution_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp > CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/test_slice_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp > CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/test_hinge_loss_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: 
src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/test_image_data_layer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp > CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/test_platform.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_platform.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_platform.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp > CMakeFiles/test.testbin.dir/test_platform.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_platform.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp -o CMakeFiles/test.testbin.dir/test_platform.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/test_data_transformer.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp > CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires: +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires + $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides + +src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o + +# Object files for target test.testbin +test_testbin_OBJECTS = \ +"CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_filler.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_common.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" \ 
+"CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_io.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_net.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_solver.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_blob.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" \ 
+"CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_db.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_platform.cpp.o" \ +"CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" + +# External object files for target test.testbin +test_testbin_EXTERNAL_OBJECTS = \ +"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o" + +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o +test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o +test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o +test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o +test/test.testbin: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o +test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/build.make +test/test.testbin: lib/libgtest.a +test/test.testbin: lib/libcaffe.so +test/test.testbin: lib/libproto.a +test/test.testbin: /usr/local/lib/libboost_system.so +test/test.testbin: /usr/local/lib/libboost_thread.so +test/test.testbin: /usr/lib/x86_64-linux-gnu/libpthread.so +test/test.testbin: /usr/local/lib/libglog.so +test/test.testbin: /usr/local/lib/libgflags.a +test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so +test/test.testbin: /usr/local/lib/libglog.so +test/test.testbin: /usr/local/lib/libgflags.a +test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so +test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so +test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5.so +test/test.testbin: /usr/local/lib/liblmdb.so +test/test.testbin: /usr/lib/x86_64-linux-gnu/libleveldb.so +test/test.testbin: /usr/lib/libsnappy.so +test/test.testbin: /usr/local/cuda/lib64/libcudart.so +test/test.testbin: /usr/local/cuda/lib64/libcurand.so +test/test.testbin: /usr/local/cuda/lib64/libcublas.so +test/test.testbin: /usr/local/lib/libopencv_highgui.so.2.4.10 +test/test.testbin: /usr/local/lib/libopencv_imgproc.so.2.4.10 +test/test.testbin: /usr/local/lib/libopencv_core.so.2.4.10 +test/test.testbin: /usr/lib/liblapack_atlas.so +test/test.testbin: /usr/lib/libcblas.so +test/test.testbin: /usr/lib/libatlas.so +test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/link.txt + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX executable ../../../test/test.testbin" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/test.testbin.dir/link.txt --verbose=$(VERBOSE) + +# Rule to build all files generated by this target. +src/caffe/test/CMakeFiles/test.testbin.dir/build: test/test.testbin +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/build + +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires 
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires 
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires 
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires 
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires +src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/requires + +src/caffe/test/CMakeFiles/test.testbin.dir/clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/test.testbin.dir/cmake_clean.cmake +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/clean + +src/caffe/test/CMakeFiles/test.testbin.dir/depend: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake --color=$(COLOR) +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/depend + diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake new file mode 100644 index 00000000..3270b673 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake @@ -0,0 +1,68 @@ +FILE(REMOVE_RECURSE + "CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o" + "CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_filler.cpp.o" + "CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_common.cpp.o" + "CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" + "CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" + 
"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" + "CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" + "CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" + "CMakeFiles/test.testbin.dir/test_io.cpp.o" + "CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" + "CMakeFiles/test.testbin.dir/test_net.cpp.o" + "CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_solver.cpp.o" + "CMakeFiles/test.testbin.dir/test_blob.cpp.o" + "CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" + "CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" + "CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" + "CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" + "CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" + "CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" + 
"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" + "CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" + "CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" + "CMakeFiles/test.testbin.dir/test_db.cpp.o" + "CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" + "CMakeFiles/test.testbin.dir/test_platform.cpp.o" + "CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" + "../../../test/test.testbin.pdb" + "../../../test/test.testbin" +) + +# Per-language clean rules from dependency scanning. +FOREACH(lang CXX) + INCLUDE(CMakeFiles/test.testbin.dir/cmake_clean_${lang}.cmake OPTIONAL) +ENDFOREACH(lang) diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make new file mode 100644 index 00000000..e3607644 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make @@ -0,0 +1,2 @@ +# Empty dependencies file for test.testbin. +# This may be replaced when dependencies are built. diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make new file mode 100644 index 00000000..8b4ef992 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make @@ -0,0 +1,8 @@ +# CMAKE generated file: DO NOT EDIT! 
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# compile CXX with /usr/bin/c++ +CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe + +CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE + diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt new file mode 100644 index 00000000..35426fa4 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt @@ -0,0 +1 @@ +/usr/bin/c++ -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o CMakeFiles/test.testbin.dir/test_filler.cpp.o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o CMakeFiles/test.testbin.dir/test_common.cpp.o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o CMakeFiles/test.testbin.dir/test_io.cpp.o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o CMakeFiles/test.testbin.dir/test_net.cpp.o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_solver.cpp.o 
CMakeFiles/test.testbin.dir/test_blob.cpp.o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o CMakeFiles/test.testbin.dir/test_db.cpp.o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_platform.cpp.o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o 
CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o -o ../../../test/test.testbin -L/usr/local/cuda/lib64 -L/usr/local/lib -rdynamic ../../../lib/libgtest.a -Wl,--whole-archive ../../../lib/libcaffe.so -Wl,--no-whole-archive ../../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 /usr/local/lib/libopencv_core.so.2.4.10 -llapack_atlas -lcblas -latlas -Wl,-rpath,/usr/local/cuda/lib64:/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib:/usr/local/lib diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make new file mode 100644 index 00000000..9de70a55 --- /dev/null +++ b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make @@ -0,0 +1,60 @@ +CMAKE_PROGRESS_1 = +CMAKE_PROGRESS_2 = 69 +CMAKE_PROGRESS_3 = +CMAKE_PROGRESS_4 = 70 +CMAKE_PROGRESS_5 = +CMAKE_PROGRESS_6 = 71 +CMAKE_PROGRESS_7 = +CMAKE_PROGRESS_8 = 72 +CMAKE_PROGRESS_9 = +CMAKE_PROGRESS_10 = 73 +CMAKE_PROGRESS_11 = +CMAKE_PROGRESS_12 = 74 +CMAKE_PROGRESS_13 = +CMAKE_PROGRESS_14 = 75 +CMAKE_PROGRESS_15 = +CMAKE_PROGRESS_16 = 76 +CMAKE_PROGRESS_17 = +CMAKE_PROGRESS_18 = 77 +CMAKE_PROGRESS_19 = +CMAKE_PROGRESS_20 = 78 +CMAKE_PROGRESS_21 = +CMAKE_PROGRESS_22 = 79 +CMAKE_PROGRESS_23 = +CMAKE_PROGRESS_24 = 80 +CMAKE_PROGRESS_25 = +CMAKE_PROGRESS_26 = 81 +CMAKE_PROGRESS_27 = +CMAKE_PROGRESS_28 = 82 +CMAKE_PROGRESS_29 = +CMAKE_PROGRESS_30 = 83 +CMAKE_PROGRESS_31 = +CMAKE_PROGRESS_32 = 84 +CMAKE_PROGRESS_33 = +CMAKE_PROGRESS_34 = 85 +CMAKE_PROGRESS_35 = +CMAKE_PROGRESS_36 = 86 +CMAKE_PROGRESS_37 = 
+CMAKE_PROGRESS_38 = 87 +CMAKE_PROGRESS_39 = +CMAKE_PROGRESS_40 = 88 +CMAKE_PROGRESS_41 = +CMAKE_PROGRESS_42 = 89 +CMAKE_PROGRESS_43 = +CMAKE_PROGRESS_44 = 90 +CMAKE_PROGRESS_45 = +CMAKE_PROGRESS_46 = 91 +CMAKE_PROGRESS_47 = +CMAKE_PROGRESS_48 = 92 +CMAKE_PROGRESS_49 = +CMAKE_PROGRESS_50 = 93 +CMAKE_PROGRESS_51 = +CMAKE_PROGRESS_52 = 94 +CMAKE_PROGRESS_53 = +CMAKE_PROGRESS_54 = 95 +CMAKE_PROGRESS_55 = +CMAKE_PROGRESS_56 = 96 +CMAKE_PROGRESS_57 = +CMAKE_PROGRESS_58 = 97 +CMAKE_PROGRESS_59 = + diff --git a/src/caffe/test/Makefile b/src/caffe/test/Makefile new file mode 100644 index 00000000..c9e785c7 --- /dev/null +++ b/src/caffe/test/Makefile @@ -0,0 +1,1766 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Default target executed when no arguments are given to make. +default_target: all +.PHONY : default_target + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. 
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: install/local +.PHONY : install/local/fast + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." 
+ /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: install/strip +.PHONY : install/strip/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/progress.marks + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/all + $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/caffe/test/CMakeFiles/runtest.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/rule + +# Convenience name for target. +runtest: src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : runtest + +# fast build rule for target. +runtest/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/runtest.dir/build.make src/caffe/test/CMakeFiles/runtest.dir/build +.PHONY : runtest/fast + +# Convenience name for target. +src/caffe/test/CMakeFiles/test.testbin.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/rule + +# Convenience name for target. +test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : test.testbin + +# fast build rule for target. 
+test.testbin/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/build +.PHONY : test.testbin/fast + +test_accuracy_layer.o: test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.o + +# target to build an object file +test_accuracy_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.cpp.o + +test_accuracy_layer.i: test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.i + +# target to preprocess a source file +test_accuracy_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.cpp.i + +test_accuracy_layer.s: test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.s + +# target to generate assembly for a file +test_accuracy_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.cpp.s + +test_argmax_layer.o: test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.o + +# target to build an object file +test_argmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.cpp.o + +test_argmax_layer.i: test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.i + +# target to preprocess a source file +test_argmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.cpp.i + +test_argmax_layer.s: test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.s + +# target to generate assembly for a file +test_argmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.cpp.s + +test_benchmark.o: test_benchmark.cpp.o +.PHONY : test_benchmark.o + +# target to build an object file +test_benchmark.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o +.PHONY : test_benchmark.cpp.o + +test_benchmark.i: test_benchmark.cpp.i +.PHONY : test_benchmark.i + +# target to preprocess a source file +test_benchmark.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i +.PHONY : test_benchmark.cpp.i + +test_benchmark.s: test_benchmark.cpp.s +.PHONY : test_benchmark.s + +# target to generate assembly for a file +test_benchmark.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s +.PHONY : test_benchmark.cpp.s + +test_blob.o: test_blob.cpp.o +.PHONY : test_blob.o + +# target to build an object file +test_blob.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o +.PHONY : test_blob.cpp.o + +test_blob.i: test_blob.cpp.i +.PHONY : test_blob.i + +# target to preprocess a source file +test_blob.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i +.PHONY : test_blob.cpp.i + +test_blob.s: test_blob.cpp.s +.PHONY : test_blob.s + +# target to generate assembly for a file +test_blob.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s +.PHONY : test_blob.cpp.s + +test_caffe_main.o: test_caffe_main.cpp.o +.PHONY : test_caffe_main.o + +# target to build an object file +test_caffe_main.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o +.PHONY : test_caffe_main.cpp.o + +test_caffe_main.i: test_caffe_main.cpp.i +.PHONY : test_caffe_main.i + +# target to preprocess a source file +test_caffe_main.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i +.PHONY : test_caffe_main.cpp.i + +test_caffe_main.s: test_caffe_main.cpp.s +.PHONY : test_caffe_main.s + +# target to generate assembly for a file +test_caffe_main.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s +.PHONY : test_caffe_main.cpp.s + +test_common.o: test_common.cpp.o +.PHONY : test_common.o + +# target to build an object file +test_common.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o +.PHONY : test_common.cpp.o + +test_common.i: test_common.cpp.i +.PHONY : test_common.i + +# target to preprocess a source file +test_common.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i +.PHONY : test_common.cpp.i + +test_common.s: test_common.cpp.s +.PHONY : test_common.s + +# target to generate assembly for a file +test_common.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s +.PHONY : test_common.cpp.s + +test_concat_layer.o: test_concat_layer.cpp.o +.PHONY : test_concat_layer.o + +# target to build an object file +test_concat_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o +.PHONY : test_concat_layer.cpp.o + +test_concat_layer.i: test_concat_layer.cpp.i +.PHONY : test_concat_layer.i + +# target to preprocess a source file +test_concat_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i +.PHONY : test_concat_layer.cpp.i + +test_concat_layer.s: test_concat_layer.cpp.s +.PHONY : test_concat_layer.s + +# target to generate assembly for a file +test_concat_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s +.PHONY : test_concat_layer.cpp.s + +test_contrastive_loss_layer.o: test_contrastive_loss_layer.cpp.o +.PHONY : test_contrastive_loss_layer.o + +# target to build an object file +test_contrastive_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o 
+.PHONY : test_contrastive_loss_layer.cpp.o + +test_contrastive_loss_layer.i: test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.i + +# target to preprocess a source file +test_contrastive_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.cpp.i + +test_contrastive_loss_layer.s: test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.s + +# target to generate assembly for a file +test_contrastive_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.cpp.s + +test_convolution_layer.o: test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.o + +# target to build an object file +test_convolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.cpp.o + +test_convolution_layer.i: test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.i + +# target to preprocess a source file +test_convolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.cpp.i + +test_convolution_layer.s: test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.s + +# target to generate assembly for a file +test_convolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.cpp.s + +test_data_layer.o: test_data_layer.cpp.o +.PHONY : test_data_layer.o + +# target to build an object file +test_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o +.PHONY : test_data_layer.cpp.o + +test_data_layer.i: test_data_layer.cpp.i +.PHONY : test_data_layer.i + +# target to preprocess a source file +test_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i +.PHONY : test_data_layer.cpp.i + +test_data_layer.s: test_data_layer.cpp.s +.PHONY : test_data_layer.s + +# target to generate assembly for a file +test_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s +.PHONY : test_data_layer.cpp.s + +test_data_transformer.o: test_data_transformer.cpp.o +.PHONY : test_data_transformer.o + +# target to build an object file +test_data_transformer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o +.PHONY : test_data_transformer.cpp.o + +test_data_transformer.i: test_data_transformer.cpp.i +.PHONY : test_data_transformer.i + +# target to preprocess a source file +test_data_transformer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i +.PHONY : test_data_transformer.cpp.i + +test_data_transformer.s: test_data_transformer.cpp.s +.PHONY : 
test_data_transformer.s + +# target to generate assembly for a file +test_data_transformer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s +.PHONY : test_data_transformer.cpp.s + +test_db.o: test_db.cpp.o +.PHONY : test_db.o + +# target to build an object file +test_db.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o +.PHONY : test_db.cpp.o + +test_db.i: test_db.cpp.i +.PHONY : test_db.i + +# target to preprocess a source file +test_db.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i +.PHONY : test_db.cpp.i + +test_db.s: test_db.cpp.s +.PHONY : test_db.s + +# target to generate assembly for a file +test_db.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s +.PHONY : test_db.cpp.s + +test_deconvolution_layer.o: test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.o + +# target to build an object file +test_deconvolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.cpp.o + +test_deconvolution_layer.i: test_deconvolution_layer.cpp.i +.PHONY : test_deconvolution_layer.i + +# target to preprocess a source file +test_deconvolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i 
+.PHONY : test_deconvolution_layer.cpp.i + +test_deconvolution_layer.s: test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.s + +# target to generate assembly for a file +test_deconvolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.cpp.s + +test_dummy_data_layer.o: test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.o + +# target to build an object file +test_dummy_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.cpp.o + +test_dummy_data_layer.i: test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.i + +# target to preprocess a source file +test_dummy_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.cpp.i + +test_dummy_data_layer.s: test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.s + +# target to generate assembly for a file +test_dummy_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.cpp.s + +test_eltwise_layer.o: test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.o + +# target to build an object file +test_eltwise_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.cpp.o + 
+test_eltwise_layer.i: test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.i + +# target to preprocess a source file +test_eltwise_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.cpp.i + +test_eltwise_layer.s: test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.s + +# target to generate assembly for a file +test_eltwise_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.cpp.s + +test_euclidean_loss_layer.o: test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.o + +# target to build an object file +test_euclidean_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.cpp.o + +test_euclidean_loss_layer.i: test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.i + +# target to preprocess a source file +test_euclidean_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.cpp.i + +test_euclidean_loss_layer.s: test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.s + +# target to generate assembly for a file +test_euclidean_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.cpp.s + +test_filler.o: 
test_filler.cpp.o +.PHONY : test_filler.o + +# target to build an object file +test_filler.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o +.PHONY : test_filler.cpp.o + +test_filler.i: test_filler.cpp.i +.PHONY : test_filler.i + +# target to preprocess a source file +test_filler.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i +.PHONY : test_filler.cpp.i + +test_filler.s: test_filler.cpp.s +.PHONY : test_filler.s + +# target to generate assembly for a file +test_filler.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s +.PHONY : test_filler.cpp.s + +test_filter_layer.o: test_filter_layer.cpp.o +.PHONY : test_filter_layer.o + +# target to build an object file +test_filter_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o +.PHONY : test_filter_layer.cpp.o + +test_filter_layer.i: test_filter_layer.cpp.i +.PHONY : test_filter_layer.i + +# target to preprocess a source file +test_filter_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i +.PHONY : test_filter_layer.cpp.i + +test_filter_layer.s: test_filter_layer.cpp.s +.PHONY : test_filter_layer.s + +# target to generate assembly for a file +test_filter_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s +.PHONY : test_filter_layer.cpp.s + +test_flatten_layer.o: test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.o + +# target to build an object file +test_flatten_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.cpp.o + +test_flatten_layer.i: test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.i + +# target to preprocess a source file +test_flatten_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.cpp.i + +test_flatten_layer.s: test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.s + +# target to generate assembly for a file +test_flatten_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.cpp.s + +test_gradient_based_solver.o: test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.o + +# target to build an object file +test_gradient_based_solver.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.cpp.o + +test_gradient_based_solver.i: test_gradient_based_solver.cpp.i +.PHONY : test_gradient_based_solver.i + +# target to preprocess a source file +test_gradient_based_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i 
+.PHONY : test_gradient_based_solver.cpp.i + +test_gradient_based_solver.s: test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.s + +# target to generate assembly for a file +test_gradient_based_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.cpp.s + +test_hdf5_output_layer.o: test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.o + +# target to build an object file +test_hdf5_output_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.cpp.o + +test_hdf5_output_layer.i: test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.i + +# target to preprocess a source file +test_hdf5_output_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.cpp.i + +test_hdf5_output_layer.s: test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.s + +# target to generate assembly for a file +test_hdf5_output_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.cpp.s + +test_hdf5data_layer.o: test_hdf5data_layer.cpp.o +.PHONY : test_hdf5data_layer.o + +# target to build an object file +test_hdf5data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o +.PHONY : 
test_hdf5data_layer.cpp.o + +test_hdf5data_layer.i: test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.i + +# target to preprocess a source file +test_hdf5data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.cpp.i + +test_hdf5data_layer.s: test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.s + +# target to generate assembly for a file +test_hdf5data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.cpp.s + +test_hinge_loss_layer.o: test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.o + +# target to build an object file +test_hinge_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.cpp.o + +test_hinge_loss_layer.i: test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.i + +# target to preprocess a source file +test_hinge_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.cpp.i + +test_hinge_loss_layer.s: test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.s + +# target to generate assembly for a file +test_hinge_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.cpp.s + +test_im2col_layer.o: test_im2col_layer.cpp.o +.PHONY : 
test_im2col_layer.o + +# target to build an object file +test_im2col_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o +.PHONY : test_im2col_layer.cpp.o + +test_im2col_layer.i: test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.i + +# target to preprocess a source file +test_im2col_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.cpp.i + +test_im2col_layer.s: test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.s + +# target to generate assembly for a file +test_im2col_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.cpp.s + +test_image_data_layer.o: test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.o + +# target to build an object file +test_image_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.cpp.o + +test_image_data_layer.i: test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.i + +# target to preprocess a source file +test_image_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.cpp.i + +test_image_data_layer.s: test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.s + +# target to generate assembly for a file +test_image_data_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.cpp.s + +test_infogain_loss_layer.o: test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.o + +# target to build an object file +test_infogain_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.cpp.o + +test_infogain_loss_layer.i: test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.i + +# target to preprocess a source file +test_infogain_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.cpp.i + +test_infogain_loss_layer.s: test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.s + +# target to generate assembly for a file +test_infogain_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.cpp.s + +test_inner_product_layer.o: test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.o + +# target to build an object file +test_inner_product_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.cpp.o + +test_inner_product_layer.i: test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.i + +# target to preprocess a source file 
+test_inner_product_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.cpp.i + +test_inner_product_layer.s: test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.s + +# target to generate assembly for a file +test_inner_product_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.cpp.s + +test_internal_thread.o: test_internal_thread.cpp.o +.PHONY : test_internal_thread.o + +# target to build an object file +test_internal_thread.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o +.PHONY : test_internal_thread.cpp.o + +test_internal_thread.i: test_internal_thread.cpp.i +.PHONY : test_internal_thread.i + +# target to preprocess a source file +test_internal_thread.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i +.PHONY : test_internal_thread.cpp.i + +test_internal_thread.s: test_internal_thread.cpp.s +.PHONY : test_internal_thread.s + +# target to generate assembly for a file +test_internal_thread.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s +.PHONY : test_internal_thread.cpp.s + +test_io.o: test_io.cpp.o +.PHONY : test_io.o + +# target to build an object file +test_io.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o +.PHONY : test_io.cpp.o + +test_io.i: test_io.cpp.i +.PHONY : test_io.i + +# target to preprocess a source file +test_io.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i +.PHONY : test_io.cpp.i + +test_io.s: test_io.cpp.s +.PHONY : test_io.s + +# target to generate assembly for a file +test_io.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s +.PHONY : test_io.cpp.s + +test_layer_factory.o: test_layer_factory.cpp.o +.PHONY : test_layer_factory.o + +# target to build an object file +test_layer_factory.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o +.PHONY : test_layer_factory.cpp.o + +test_layer_factory.i: test_layer_factory.cpp.i +.PHONY : test_layer_factory.i + +# target to preprocess a source file +test_layer_factory.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i +.PHONY : test_layer_factory.cpp.i + +test_layer_factory.s: test_layer_factory.cpp.s +.PHONY : test_layer_factory.s + +# target to generate assembly for a file +test_layer_factory.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s +.PHONY : test_layer_factory.cpp.s + +test_lrn_layer.o: test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.o + +# target to build an object file +test_lrn_layer.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.cpp.o + +test_lrn_layer.i: test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.i + +# target to preprocess a source file +test_lrn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.cpp.i + +test_lrn_layer.s: test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.s + +# target to generate assembly for a file +test_lrn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.cpp.s + +test_math_functions.o: test_math_functions.cpp.o +.PHONY : test_math_functions.o + +# target to build an object file +test_math_functions.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o +.PHONY : test_math_functions.cpp.o + +test_math_functions.i: test_math_functions.cpp.i +.PHONY : test_math_functions.i + +# target to preprocess a source file +test_math_functions.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i +.PHONY : test_math_functions.cpp.i + +test_math_functions.s: test_math_functions.cpp.s +.PHONY : test_math_functions.s + +# target to generate assembly for a file +test_math_functions.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s +.PHONY 
: test_math_functions.cpp.s + +test_maxpool_dropout_layers.o: test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.o + +# target to build an object file +test_maxpool_dropout_layers.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.cpp.o + +test_maxpool_dropout_layers.i: test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.i + +# target to preprocess a source file +test_maxpool_dropout_layers.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.cpp.i + +test_maxpool_dropout_layers.s: test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.s + +# target to generate assembly for a file +test_maxpool_dropout_layers.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.cpp.s + +test_memory_data_layer.o: test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.o + +# target to build an object file +test_memory_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.cpp.o + +test_memory_data_layer.i: test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.i + +# target to preprocess a source file +test_memory_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.cpp.i + +test_memory_data_layer.s: test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.s + +# target to generate assembly for a file +test_memory_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.cpp.s + +test_multinomial_logistic_loss_layer.o: test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.o + +# target to build an object file +test_multinomial_logistic_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.cpp.o + +test_multinomial_logistic_loss_layer.i: test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.i + +# target to preprocess a source file +test_multinomial_logistic_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.cpp.i + +test_multinomial_logistic_loss_layer.s: test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.s + +# target to generate assembly for a file +test_multinomial_logistic_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.cpp.s + +test_mvn_layer.o: test_mvn_layer.cpp.o +.PHONY : 
test_mvn_layer.o + +# target to build an object file +test_mvn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o +.PHONY : test_mvn_layer.cpp.o + +test_mvn_layer.i: test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.i + +# target to preprocess a source file +test_mvn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.cpp.i + +test_mvn_layer.s: test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.s + +# target to generate assembly for a file +test_mvn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.cpp.s + +test_net.o: test_net.cpp.o +.PHONY : test_net.o + +# target to build an object file +test_net.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o +.PHONY : test_net.cpp.o + +test_net.i: test_net.cpp.i +.PHONY : test_net.i + +# target to preprocess a source file +test_net.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i +.PHONY : test_net.cpp.i + +test_net.s: test_net.cpp.s +.PHONY : test_net.s + +# target to generate assembly for a file +test_net.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s +.PHONY : test_net.cpp.s + +test_neuron_layer.o: test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.o + +# target 
to build an object file +test_neuron_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.cpp.o + +test_neuron_layer.i: test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.i + +# target to preprocess a source file +test_neuron_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.cpp.i + +test_neuron_layer.s: test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.s + +# target to generate assembly for a file +test_neuron_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.cpp.s + +test_platform.o: test_platform.cpp.o +.PHONY : test_platform.o + +# target to build an object file +test_platform.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o +.PHONY : test_platform.cpp.o + +test_platform.i: test_platform.cpp.i +.PHONY : test_platform.i + +# target to preprocess a source file +test_platform.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i +.PHONY : test_platform.cpp.i + +test_platform.s: test_platform.cpp.s +.PHONY : test_platform.s + +# target to generate assembly for a file +test_platform.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s +.PHONY : 
test_platform.cpp.s + +test_pooling_layer.o: test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.o + +# target to build an object file +test_pooling_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.cpp.o + +test_pooling_layer.i: test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.i + +# target to preprocess a source file +test_pooling_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.cpp.i + +test_pooling_layer.s: test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.s + +# target to generate assembly for a file +test_pooling_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.cpp.s + +test_power_layer.o: test_power_layer.cpp.o +.PHONY : test_power_layer.o + +# target to build an object file +test_power_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o +.PHONY : test_power_layer.cpp.o + +test_power_layer.i: test_power_layer.cpp.i +.PHONY : test_power_layer.i + +# target to preprocess a source file +test_power_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i +.PHONY : test_power_layer.cpp.i + +test_power_layer.s: test_power_layer.cpp.s +.PHONY : test_power_layer.s + +# target to generate assembly for a file +test_power_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s +.PHONY : test_power_layer.cpp.s + +test_protobuf.o: test_protobuf.cpp.o +.PHONY : test_protobuf.o + +# target to build an object file +test_protobuf.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o +.PHONY : test_protobuf.cpp.o + +test_protobuf.i: test_protobuf.cpp.i +.PHONY : test_protobuf.i + +# target to preprocess a source file +test_protobuf.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i +.PHONY : test_protobuf.cpp.i + +test_protobuf.s: test_protobuf.cpp.s +.PHONY : test_protobuf.s + +# target to generate assembly for a file +test_protobuf.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s +.PHONY : test_protobuf.cpp.s + +test_random_number_generator.o: test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.o + +# target to build an object file +test_random_number_generator.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.cpp.o + +test_random_number_generator.i: test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.i + +# target to preprocess a source file +test_random_number_generator.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.cpp.i + +test_random_number_generator.s: test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.s + +# target to generate assembly for a file +test_random_number_generator.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.cpp.s + +test_reduction_layer.o: test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.o + +# target to build an object file +test_reduction_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.cpp.o + +test_reduction_layer.i: test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.i + +# target to preprocess a source file +test_reduction_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.cpp.i + +test_reduction_layer.s: test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.s + +# target to generate assembly for a file +test_reduction_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.cpp.s + +test_reshape_layer.o: test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.o + +# target to build an object file +test_reshape_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.cpp.o + +test_reshape_layer.i: test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.i + +# target to preprocess a source file +test_reshape_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.cpp.i + +test_reshape_layer.s: test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.s + +# target to generate assembly for a file +test_reshape_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.cpp.s + +test_sigmoid_cross_entropy_loss_layer.o: test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.o + +# target to build an object file +test_sigmoid_cross_entropy_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.o + +test_sigmoid_cross_entropy_loss_layer.i: test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.i + +# target to preprocess a source file +test_sigmoid_cross_entropy_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.i + +test_sigmoid_cross_entropy_loss_layer.s: test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.s + +# target to generate assembly for a file 
+test_sigmoid_cross_entropy_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.s + +test_slice_layer.o: test_slice_layer.cpp.o +.PHONY : test_slice_layer.o + +# target to build an object file +test_slice_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o +.PHONY : test_slice_layer.cpp.o + +test_slice_layer.i: test_slice_layer.cpp.i +.PHONY : test_slice_layer.i + +# target to preprocess a source file +test_slice_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i +.PHONY : test_slice_layer.cpp.i + +test_slice_layer.s: test_slice_layer.cpp.s +.PHONY : test_slice_layer.s + +# target to generate assembly for a file +test_slice_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s +.PHONY : test_slice_layer.cpp.s + +test_softmax_layer.o: test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.o + +# target to build an object file +test_softmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.cpp.o + +test_softmax_layer.i: test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.i + +# target to preprocess a source file +test_softmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.cpp.i + +test_softmax_layer.s: test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.s + +# target to generate assembly for a file +test_softmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.cpp.s + +test_softmax_with_loss_layer.o: test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.o + +# target to build an object file +test_softmax_with_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.cpp.o + +test_softmax_with_loss_layer.i: test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.i + +# target to preprocess a source file +test_softmax_with_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.cpp.i + +test_softmax_with_loss_layer.s: test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.s + +# target to generate assembly for a file +test_softmax_with_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.cpp.s + +test_solver.o: test_solver.cpp.o +.PHONY : test_solver.o + +# target to build an object file +test_solver.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o +.PHONY : test_solver.cpp.o + +test_solver.i: test_solver.cpp.i +.PHONY : test_solver.i + +# target to preprocess a source file +test_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i +.PHONY : test_solver.cpp.i + +test_solver.s: test_solver.cpp.s +.PHONY : test_solver.s + +# target to generate assembly for a file +test_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s +.PHONY : test_solver.cpp.s + +test_split_layer.o: test_split_layer.cpp.o +.PHONY : test_split_layer.o + +# target to build an object file +test_split_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o +.PHONY : test_split_layer.cpp.o + +test_split_layer.i: test_split_layer.cpp.i +.PHONY : test_split_layer.i + +# target to preprocess a source file +test_split_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i +.PHONY : test_split_layer.cpp.i + +test_split_layer.s: test_split_layer.cpp.s +.PHONY : test_split_layer.s + +# target to generate assembly for a file +test_split_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s +.PHONY : test_split_layer.cpp.s + +test_spp_layer.o: test_spp_layer.cpp.o +.PHONY : test_spp_layer.o + +# target to build an object file +test_spp_layer.cpp.o: 
+ cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o +.PHONY : test_spp_layer.cpp.o + +test_spp_layer.i: test_spp_layer.cpp.i +.PHONY : test_spp_layer.i + +# target to preprocess a source file +test_spp_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i +.PHONY : test_spp_layer.cpp.i + +test_spp_layer.s: test_spp_layer.cpp.s +.PHONY : test_spp_layer.s + +# target to generate assembly for a file +test_spp_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s +.PHONY : test_spp_layer.cpp.s + +test_stochastic_pooling.o: test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.o + +# target to build an object file +test_stochastic_pooling.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.cpp.o + +test_stochastic_pooling.i: test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.i + +# target to preprocess a source file +test_stochastic_pooling.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.cpp.i + +test_stochastic_pooling.s: test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.s + +# target to generate assembly for a file +test_stochastic_pooling.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.cpp.s + +test_syncedmem.o: test_syncedmem.cpp.o +.PHONY : test_syncedmem.o + +# target to build an object file +test_syncedmem.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o +.PHONY : test_syncedmem.cpp.o + +test_syncedmem.i: test_syncedmem.cpp.i +.PHONY : test_syncedmem.i + +# target to preprocess a source file +test_syncedmem.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i +.PHONY : test_syncedmem.cpp.i + +test_syncedmem.s: test_syncedmem.cpp.s +.PHONY : test_syncedmem.s + +# target to generate assembly for a file +test_syncedmem.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s +.PHONY : test_syncedmem.cpp.s + +test_tanh_layer.o: test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.o + +# target to build an object file +test_tanh_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.cpp.o + +test_tanh_layer.i: test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.i + +# target to preprocess a source file +test_tanh_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.cpp.i + +test_tanh_layer.s: test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.s + +# target to generate assembly for a file +test_tanh_layer.cpp.s: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.cpp.s + +test_threshold_layer.o: test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.o + +# target to build an object file +test_threshold_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.cpp.o + +test_threshold_layer.i: test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.i + +# target to preprocess a source file +test_threshold_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.cpp.i + +test_threshold_layer.s: test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.s + +# target to generate assembly for a file +test_threshold_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.cpp.s + +test_upgrade_proto.o: test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.o + +# target to build an object file +test_upgrade_proto.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.cpp.o + +test_upgrade_proto.i: test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.i + +# target to preprocess a source file +test_upgrade_proto.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.cpp.i + +test_upgrade_proto.s: test_upgrade_proto.cpp.s +.PHONY : test_upgrade_proto.s + +# target to generate assembly for a file +test_upgrade_proto.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s +.PHONY : test_upgrade_proto.cpp.s + +test_util_blas.o: test_util_blas.cpp.o +.PHONY : test_util_blas.o + +# target to build an object file +test_util_blas.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o +.PHONY : test_util_blas.cpp.o + +test_util_blas.i: test_util_blas.cpp.i +.PHONY : test_util_blas.i + +# target to preprocess a source file +test_util_blas.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i +.PHONY : test_util_blas.cpp.i + +test_util_blas.s: test_util_blas.cpp.s +.PHONY : test_util_blas.s + +# target to generate assembly for a file +test_util_blas.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s +.PHONY : test_util_blas.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... runtest" + @echo "... test.testbin" + @echo "... test_accuracy_layer.o" + @echo "... 
test_accuracy_layer.i" + @echo "... test_accuracy_layer.s" + @echo "... test_argmax_layer.o" + @echo "... test_argmax_layer.i" + @echo "... test_argmax_layer.s" + @echo "... test_benchmark.o" + @echo "... test_benchmark.i" + @echo "... test_benchmark.s" + @echo "... test_blob.o" + @echo "... test_blob.i" + @echo "... test_blob.s" + @echo "... test_caffe_main.o" + @echo "... test_caffe_main.i" + @echo "... test_caffe_main.s" + @echo "... test_common.o" + @echo "... test_common.i" + @echo "... test_common.s" + @echo "... test_concat_layer.o" + @echo "... test_concat_layer.i" + @echo "... test_concat_layer.s" + @echo "... test_contrastive_loss_layer.o" + @echo "... test_contrastive_loss_layer.i" + @echo "... test_contrastive_loss_layer.s" + @echo "... test_convolution_layer.o" + @echo "... test_convolution_layer.i" + @echo "... test_convolution_layer.s" + @echo "... test_data_layer.o" + @echo "... test_data_layer.i" + @echo "... test_data_layer.s" + @echo "... test_data_transformer.o" + @echo "... test_data_transformer.i" + @echo "... test_data_transformer.s" + @echo "... test_db.o" + @echo "... test_db.i" + @echo "... test_db.s" + @echo "... test_deconvolution_layer.o" + @echo "... test_deconvolution_layer.i" + @echo "... test_deconvolution_layer.s" + @echo "... test_dummy_data_layer.o" + @echo "... test_dummy_data_layer.i" + @echo "... test_dummy_data_layer.s" + @echo "... test_eltwise_layer.o" + @echo "... test_eltwise_layer.i" + @echo "... test_eltwise_layer.s" + @echo "... test_euclidean_loss_layer.o" + @echo "... test_euclidean_loss_layer.i" + @echo "... test_euclidean_loss_layer.s" + @echo "... test_filler.o" + @echo "... test_filler.i" + @echo "... test_filler.s" + @echo "... test_filter_layer.o" + @echo "... test_filter_layer.i" + @echo "... test_filter_layer.s" + @echo "... test_flatten_layer.o" + @echo "... test_flatten_layer.i" + @echo "... test_flatten_layer.s" + @echo "... test_gradient_based_solver.o" + @echo "... 
test_gradient_based_solver.i" + @echo "... test_gradient_based_solver.s" + @echo "... test_hdf5_output_layer.o" + @echo "... test_hdf5_output_layer.i" + @echo "... test_hdf5_output_layer.s" + @echo "... test_hdf5data_layer.o" + @echo "... test_hdf5data_layer.i" + @echo "... test_hdf5data_layer.s" + @echo "... test_hinge_loss_layer.o" + @echo "... test_hinge_loss_layer.i" + @echo "... test_hinge_loss_layer.s" + @echo "... test_im2col_layer.o" + @echo "... test_im2col_layer.i" + @echo "... test_im2col_layer.s" + @echo "... test_image_data_layer.o" + @echo "... test_image_data_layer.i" + @echo "... test_image_data_layer.s" + @echo "... test_infogain_loss_layer.o" + @echo "... test_infogain_loss_layer.i" + @echo "... test_infogain_loss_layer.s" + @echo "... test_inner_product_layer.o" + @echo "... test_inner_product_layer.i" + @echo "... test_inner_product_layer.s" + @echo "... test_internal_thread.o" + @echo "... test_internal_thread.i" + @echo "... test_internal_thread.s" + @echo "... test_io.o" + @echo "... test_io.i" + @echo "... test_io.s" + @echo "... test_layer_factory.o" + @echo "... test_layer_factory.i" + @echo "... test_layer_factory.s" + @echo "... test_lrn_layer.o" + @echo "... test_lrn_layer.i" + @echo "... test_lrn_layer.s" + @echo "... test_math_functions.o" + @echo "... test_math_functions.i" + @echo "... test_math_functions.s" + @echo "... test_maxpool_dropout_layers.o" + @echo "... test_maxpool_dropout_layers.i" + @echo "... test_maxpool_dropout_layers.s" + @echo "... test_memory_data_layer.o" + @echo "... test_memory_data_layer.i" + @echo "... test_memory_data_layer.s" + @echo "... test_multinomial_logistic_loss_layer.o" + @echo "... test_multinomial_logistic_loss_layer.i" + @echo "... test_multinomial_logistic_loss_layer.s" + @echo "... test_mvn_layer.o" + @echo "... test_mvn_layer.i" + @echo "... test_mvn_layer.s" + @echo "... test_net.o" + @echo "... test_net.i" + @echo "... test_net.s" + @echo "... test_neuron_layer.o" + @echo "... 
test_neuron_layer.i" + @echo "... test_neuron_layer.s" + @echo "... test_platform.o" + @echo "... test_platform.i" + @echo "... test_platform.s" + @echo "... test_pooling_layer.o" + @echo "... test_pooling_layer.i" + @echo "... test_pooling_layer.s" + @echo "... test_power_layer.o" + @echo "... test_power_layer.i" + @echo "... test_power_layer.s" + @echo "... test_protobuf.o" + @echo "... test_protobuf.i" + @echo "... test_protobuf.s" + @echo "... test_random_number_generator.o" + @echo "... test_random_number_generator.i" + @echo "... test_random_number_generator.s" + @echo "... test_reduction_layer.o" + @echo "... test_reduction_layer.i" + @echo "... test_reduction_layer.s" + @echo "... test_reshape_layer.o" + @echo "... test_reshape_layer.i" + @echo "... test_reshape_layer.s" + @echo "... test_sigmoid_cross_entropy_loss_layer.o" + @echo "... test_sigmoid_cross_entropy_loss_layer.i" + @echo "... test_sigmoid_cross_entropy_loss_layer.s" + @echo "... test_slice_layer.o" + @echo "... test_slice_layer.i" + @echo "... test_slice_layer.s" + @echo "... test_softmax_layer.o" + @echo "... test_softmax_layer.i" + @echo "... test_softmax_layer.s" + @echo "... test_softmax_with_loss_layer.o" + @echo "... test_softmax_with_loss_layer.i" + @echo "... test_softmax_with_loss_layer.s" + @echo "... test_solver.o" + @echo "... test_solver.i" + @echo "... test_solver.s" + @echo "... test_split_layer.o" + @echo "... test_split_layer.i" + @echo "... test_split_layer.s" + @echo "... test_spp_layer.o" + @echo "... test_spp_layer.i" + @echo "... test_spp_layer.s" + @echo "... test_stochastic_pooling.o" + @echo "... test_stochastic_pooling.i" + @echo "... test_stochastic_pooling.s" + @echo "... test_syncedmem.o" + @echo "... test_syncedmem.i" + @echo "... test_syncedmem.s" + @echo "... test_tanh_layer.o" + @echo "... test_tanh_layer.i" + @echo "... test_tanh_layer.s" + @echo "... test_threshold_layer.o" + @echo "... test_threshold_layer.i" + @echo "... test_threshold_layer.s" + @echo "... 
test_upgrade_proto.o" + @echo "... test_upgrade_proto.i" + @echo "... test_upgrade_proto.s" + @echo "... test_util_blas.o" + @echo "... test_util_blas.i" + @echo "... test_util_blas.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/caffe/test/cmake_install.cmake b/src/caffe/test/cmake_install.cmake new file mode 100644 index 00000000..fa890cd7 --- /dev/null +++ b/src/caffe/test/cmake_install.cmake @@ -0,0 +1,34 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. 
+IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? +IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index c8caf5ac..5f41d325 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -2,6 +2,7 @@ // to allow a main function to be compiled into the binary. #include "caffe/caffe.hpp" +#include "caffe/common.hpp" #include "caffe/test/test_caffe_main.hpp" namespace caffe { @@ -12,6 +13,7 @@ namespace caffe { #ifndef CPU_ONLY using caffe::CAFFE_TEST_CUDA_PROP; + #endif int main(int argc, char** argv) { @@ -19,8 +21,8 @@ int main(int argc, char** argv) { caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY // Before starting testing, let's first print out a few cuda defice info. - int device; - cudaGetDeviceCount(&device); + int device = 0; +// cudaGetDeviceCount(&device); cout << "Cuda number of devices: " << device << endl; if (argc > 1) { // Use the given device @@ -31,9 +33,11 @@ int main(int argc, char** argv) { // Use the device assigned in build configuration; but with a lower priority device = CUDA_TEST_DEVICE; } - cudaGetDevice(&device); +// cudaGetDevice(&device); cout << "Current device id: " << device << endl; - cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); + // cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); +// caffe::set_mode(caffe::GPU); + caffe::amdDevice.Init(); #endif // invoke the test. 
return RUN_ALL_TESTS(); diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index a8c5a83f..6942f8a3 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -25,15 +25,7 @@ Timer::~Timer() { void Timer::Start() { if (!running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(start_gpu_, 0)); -#else - NO_GPU; -#endif - } else { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + start_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = true; has_run_at_least_once_ = true; } @@ -41,16 +33,7 @@ void Timer::Start() { void Timer::Stop() { if (running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); -#else - NO_GPU; -#endif - } else { - stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + stop_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = false; } } @@ -64,18 +47,8 @@ float Timer::MicroSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); - // Cuda only measure milliseconds - elapsed_microseconds_ = elapsed_milliseconds_ * 1000; -#else - NO_GPU; -#endif - } else { - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - } + + elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); return elapsed_microseconds_; } @@ -87,16 +60,8 @@ float Timer::MilliSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); -#else - NO_GPU; -#endif - } else { - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - } + + elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); return elapsed_milliseconds_; } diff --git 
a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 677afcdf..3bef8b63 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -214,6 +214,26 @@ void caffe_gpu_axpy(const int N, const double alpha, const double* X, CLBLAS_CHECK( clblasDaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) ); } +template<> +void caffe_gpu_sgnbit(const int n, const float* x, float* y) +{ +} + +template<> +void caffe_gpu_sgnbit(const int n, const double* x, double* y) +{ +} + +template<> +void caffe_gpu_abs(const int n, const float* x, float* y) +{ +} + +template<> +void caffe_gpu_abs(const int n, const double* x, double* y) +{ +} + template <> void caffe_set(const int N, const float alpha, float* Y) { if (alpha == 0) { @@ -260,6 +280,12 @@ void caffe_copy(const int N, const double* X, double* Y) { cblas_dcopy(N, X, 1, Y, 1); } +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) +{ + OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); +} + template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { if(X != Y) diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 01c04711..e4fd42c6 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -51,6 +51,7 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count){ } // Explicit instantiation +template void ocl_memset(int* buffer, const int value, const int count); template void ocl_memset(float* buffer, const float value, const int count); template void ocl_memset(double* buffer, const double value, const int count); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index a9abda2e..9eab08ec 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -35,8 +35,8 @@ namespace caffe { typedef unsigned int uint32_t; struct array4x32 { 
uint32_t v[4]; }; - -template std::string get_dtype_suffix() +/* +template inline std::string get_dtype_suffix() { dtype x; const char type = typeid(x).name()[0]; @@ -49,7 +49,7 @@ template std::string get_dtype_suffix() } return suffix; } - +*/ template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold) { @@ -1083,5 +1083,75 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, } template void DropoutBackward(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); -} // namespace caffe +template +void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) +{ +/* std::string kernel_name = "Conv" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + + int weights_stride = kernel_w * kernel_h;//correct? + int bot_stride = width; + int bot_channel_stride = width * height; + int bot_batch_stride = width * height * channel_in; + + int top_stride = width_out; + int top_channel_stride = width_out * height_out; + int top_batch_stride = width_out * height_out * channel_out; + + //int height_out = (int)top->getDim(ANN_TENSOR_HEIGHT); + //int width_out = (int)top->getDim(ANN_TENSOR_WIDTH); + int vis_height = height_out * stride - 2 * pad; + int vis_width = width_out * stride - 2 * pad; + + int ocl_group_sz0_ = 8; + int ocl_group_sz1_ = 8; + int ocl_group_lg2sz1_ = (int)ceil(log((double)ocl_group_sz1_)/log(2.)); + int ocl_group_lg2sz0_ = (int)ceil(log((double)ocl_group_sz0_)/log(2.)); + + int outputs = channel_out; + int n_out_pix_horiz_ = (width_out < 2 * ocl_group_sz0_) ? 
1 : (width_out < 4 * ocl_group_sz0_) ? 2 : 4; + int n_out_pix_vert_ = (height_out < 2 * ocl_group_sz1_) ? 1 : 2; // (height_out <= 192) ? 2 : 4; + int n_outs_ = ((outputs & 1) == 1) ? 1 : (kernel_w == 3) && ((outputs / 4) * 4 == outputs) ? 4 : 2; // (n_out_pix_horiz_ >= 4) ? 1 : 2; + + int n_outputs = channel_out; + n_outputs /= n_outs_; + int i_n_group_horiz = (width_out + ocl_group_sz0_ * n_out_pix_horiz_ - 1) / (ocl_group_sz0_ * n_out_pix_horiz_); + int i_n_group_vert = (height_out + ocl_group_sz1_ * n_out_pix_vert_ - 1) / (ocl_group_sz1_ * n_out_pix_vert_); + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&weights); + ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&bias); + ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(ker_rand, 1, sizeof(cl_int), (void*)&kernel_w); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&channel_out); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&channel_in); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&pad); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&n_out_pix_horiz_); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&n_out_pix_vert_); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_batch_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_channel_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_batch_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_channel_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&vis_width); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), 
(void*)&vis_height); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&weights_stride); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&width_out); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&height_out); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&n_outs_); + OCL_CHECK(ret); + + size_t l_wk[3] = { ocl_group_sz0_, ocl_group_sz1_, 1}; + size_t g_wk[3] = { i_n_group_horiz * l_wk[0], i_n_group_vert * l_wk[1], batch_sz * n_outputs }; + + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );*/ +} +template void ocl_conv(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); +template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); + +} // namespace caffe diff --git a/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake b/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake new file mode 100644 index 00000000..7bb0014c --- /dev/null +++ b/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake @@ -0,0 +1,16 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Relative path conversion top directories. +SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe") +SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe") + +# Force unix paths in dependencies. +SET(CMAKE_FORCE_UNIX_PATHS 1) + + +# The C and CXX include file regular expressions for this directory. 
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$") +SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$") +SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN}) +SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN}) diff --git a/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake b/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake new file mode 100644 index 00000000..76e46409 --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake @@ -0,0 +1,32 @@ +# The set of languages for which implicit dependencies are needed: +SET(CMAKE_DEPENDS_LANGUAGES + "CXX" + ) +# The set of files for implicit dependencies of each language: +SET(CMAKE_DEPENDS_CHECK_CXX + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o" + ) +SET(CMAKE_CXX_COMPILER_ID "GNU") + +# Preprocessor definitions for this target. +SET(CMAKE_TARGET_DEFINITIONS + "GTEST_USE_OWN_TR1_TUPLE" + ) + +# Targets to which this target links. +SET(CMAKE_TARGET_LINKED_INFO_FILES + ) + +# The include file search paths: +SET(CMAKE_C_TARGET_INCLUDE_PATH + "src" + "/usr/local/include" + "include" + "/usr/local/cuda/include" + "/usr/local/include/opencv" + "/usr/include/atlas" + "." + ) +SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) +SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/gtest/CMakeFiles/gtest.dir/build.make b/src/gtest/CMakeFiles/gtest.dir/build.make new file mode 100644 index 00000000..b41ed414 --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/build.make @@ -0,0 +1,106 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. 
+.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# Include any dependencies generated for this target. +include src/gtest/CMakeFiles/gtest.dir/depend.make + +# Include the progress variables for this target. +include src/gtest/CMakeFiles/gtest.dir/progress.make + +# Include the compile flags for this target's objects. 
+include src/gtest/CMakeFiles/gtest.dir/flags.make + +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/CMakeFiles/gtest.dir/flags.make +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/gtest-all.cpp + $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/gtest.dir/gtest-all.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp + +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/gtest.dir/gtest-all.cpp.i" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp > CMakeFiles/gtest.dir/gtest-all.cpp.i + +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s: cmake_force + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/gtest.dir/gtest-all.cpp.s" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp -o CMakeFiles/gtest.dir/gtest-all.cpp.s + +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires: +.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires + +src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires + $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build +.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides + 
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o + +# Object files for target gtest +gtest_OBJECTS = \ +"CMakeFiles/gtest.dir/gtest-all.cpp.o" + +# External object files for target gtest +gtest_EXTERNAL_OBJECTS = + +lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o +lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/build.make +lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/link.txt + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libgtest.a" + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean_target.cmake + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/gtest.dir/link.txt --verbose=$(VERBOSE) + +# Rule to build all files generated by this target. +src/gtest/CMakeFiles/gtest.dir/build: lib/libgtest.a +.PHONY : src/gtest/CMakeFiles/gtest.dir/build + +src/gtest/CMakeFiles/gtest.dir/requires: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires +.PHONY : src/gtest/CMakeFiles/gtest.dir/requires + +src/gtest/CMakeFiles/gtest.dir/clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean.cmake +.PHONY : src/gtest/CMakeFiles/gtest.dir/clean + +src/gtest/CMakeFiles/gtest.dir/depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake --color=$(COLOR) +.PHONY : src/gtest/CMakeFiles/gtest.dir/depend + diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake new 
file mode 100644 index 00000000..694feb83 --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake @@ -0,0 +1,10 @@ +FILE(REMOVE_RECURSE + "CMakeFiles/gtest.dir/gtest-all.cpp.o" + "../../lib/libgtest.pdb" + "../../lib/libgtest.a" +) + +# Per-language clean rules from dependency scanning. +FOREACH(lang CXX) + INCLUDE(CMakeFiles/gtest.dir/cmake_clean_${lang}.cmake OPTIONAL) +ENDFOREACH(lang) diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake new file mode 100644 index 00000000..2c9ec14f --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake @@ -0,0 +1,3 @@ +FILE(REMOVE_RECURSE + "../../lib/libgtest.a" +) diff --git a/src/gtest/CMakeFiles/gtest.dir/depend.make b/src/gtest/CMakeFiles/gtest.dir/depend.make new file mode 100644 index 00000000..37ac348d --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/depend.make @@ -0,0 +1,2 @@ +# Empty dependencies file for gtest. +# This may be replaced when dependencies are built. diff --git a/src/gtest/CMakeFiles/gtest.dir/flags.make b/src/gtest/CMakeFiles/gtest.dir/flags.make new file mode 100644 index 00000000..8b4ef992 --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/flags.make @@ -0,0 +1,8 @@ +# CMAKE generated file: DO NOT EDIT! 
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# compile CXX with /usr/bin/c++ +CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe + +CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE + diff --git a/src/gtest/CMakeFiles/gtest.dir/link.txt b/src/gtest/CMakeFiles/gtest.dir/link.txt new file mode 100644 index 00000000..e5645cfb --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/link.txt @@ -0,0 +1,2 @@ +/usr/bin/ar cr ../../lib/libgtest.a CMakeFiles/gtest.dir/gtest-all.cpp.o +/usr/bin/ranlib ../../lib/libgtest.a diff --git a/src/gtest/CMakeFiles/gtest.dir/progress.make b/src/gtest/CMakeFiles/gtest.dir/progress.make new file mode 100644 index 00000000..143c9b1b --- /dev/null +++ b/src/gtest/CMakeFiles/gtest.dir/progress.make @@ -0,0 +1,2 @@ +CMAKE_PROGRESS_1 = 65 + diff --git a/src/gtest/CMakeFiles/progress.marks b/src/gtest/CMakeFiles/progress.marks new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/src/gtest/CMakeFiles/progress.marks @@ -0,0 +1 @@ +0 diff --git a/src/gtest/Makefile b/src/gtest/Makefile new file mode 100644 index 00000000..d1a96ceb --- /dev/null +++ b/src/gtest/Makefile @@ -0,0 +1,212 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 2.8 + +# Default target executed when no arguments are given to make. +default_target: all +.PHONY : default_target + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Remove some rules from gmake that .SUFFIXES does not remove. 
+SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The program to use to edit the cache. +CMAKE_EDIT_COMMAND = /usr/bin/ccmake + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." + /usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." 
+ /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: install/local +.PHONY : install/local/fast + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: install/strip +.PHONY : install/strip/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components +.PHONY : list_install_components/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/progress.marks + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/all + $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/gtest/CMakeFiles/gtest.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/CMakeFiles/gtest.dir/rule +.PHONY : src/gtest/CMakeFiles/gtest.dir/rule + +# Convenience name for target. +gtest: src/gtest/CMakeFiles/gtest.dir/rule +.PHONY : gtest + +# fast build rule for target. 
+gtest/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/build +.PHONY : gtest/fast + +gtest-all.o: gtest-all.cpp.o +.PHONY : gtest-all.o + +# target to build an object file +gtest-all.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o +.PHONY : gtest-all.cpp.o + +gtest-all.i: gtest-all.cpp.i +.PHONY : gtest-all.i + +# target to preprocess a source file +gtest-all.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i +.PHONY : gtest-all.cpp.i + +gtest-all.s: gtest-all.cpp.s +.PHONY : gtest-all.s + +# target to generate assembly for a file +gtest-all.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s +.PHONY : gtest-all.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... gtest" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... gtest-all.o" + @echo "... gtest-all.i" + @echo "... gtest-all.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/gtest/cmake_install.cmake b/src/gtest/cmake_install.cmake new file mode 100644 index 00000000..14c33dd5 --- /dev/null +++ b/src/gtest/cmake_install.cmake @@ -0,0 +1,34 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? 
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + From a45174ceee5506f935d4b0ac16e8b516440bea61 Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 25 Aug 2015 14:36:34 +0800 Subject: [PATCH 036/124] modified the packing number --- include/caffe/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 4cd372a6..8113c181 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -84,7 +84,7 @@ private:\ #define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -#define global_packing_N 32 +#define global_packing_N 16 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ From 5822b9357570bebe66f3ff69690c6280b5e782be Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 27 Aug 2015 15:00:54 +0800 Subject: [PATCH 037/124] remove all cuda related flags in Makefile --- Makefile | 75 ++++++++----------------------------------------- Makefile.config | 4 +-- 2 files changed, 13 insertions(+), 66 deletions(-) diff --git a/Makefile b/Makefile index f0ac9e06..905a19c3 100644 --- a/Makefile +++ b/Makefile @@ -38,13 +38,10 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so ############################## # CXX_SRCS are the source files excluding the test ones. CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp") -# CU_SRCS are the cuda source files -#CU_SRCS := $(shell find src/$(PROJECT) ! 
-name "test_*.cu" -name "*.cu") # TEST_SRCS are the test source files TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp") TEST_SRCS := $(filter-out $(TEST_MAIN_SRC), $(TEST_SRCS)) -TEST_CU_SRCS := $(shell find src/$(PROJECT) -name "test_*.cu") GTEST_SRC := src/gtest/gtest-all.cpp # TOOL_SRCS are the source files for the tool binaries TOOL_SRCS := $(shell find tools -name "*.cpp") @@ -68,7 +65,7 @@ NONGEN_CXX_SRCS := $(shell find \ matlab/+$(PROJECT)/private \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh") + -name "*.cpp" -or -name "*.hpp") LINT_SCRIPT := scripts/cpp_lint.py LINT_OUTPUT_DIR := $(BUILD_DIR)/.lint LINT_EXT := lint.txt @@ -103,22 +100,19 @@ PROTO_GEN_PY := $(foreach file,${PROTO_SRCS:.proto=_pb2.py}, \ # These objects will be linked into the final shared library, so we # exclude the tool, example, and test objects. CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o}) -CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o}) PROTO_OBJS := ${PROTO_GEN_CC:.cc=.o} -OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS) +OBJS := $(PROTO_OBJS) $(CXX_OBJS) # tool, example, and test objects TOOL_OBJS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o}) TOOL_BUILD_DIR := $(BUILD_DIR)/tools TEST_CXX_BUILD_DIR := $(BUILD_DIR)/src/$(PROJECT)/test -TEST_CU_BUILD_DIR := $(BUILD_DIR)/cuda/src/$(PROJECT)/test TEST_CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o}) -TEST_CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o}) -TEST_OBJS := $(TEST_CXX_OBJS) $(TEST_CU_OBJS) +TEST_OBJS := $(TEST_CXX_OBJS) GTEST_OBJ := $(addprefix $(BUILD_DIR)/, ${GTEST_SRC:.cpp=.o}) EXAMPLE_OBJS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o}) # Output files for automatic dependency generation -DEPS := ${CXX_OBJS:.o=.d} ${CU_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ - ${TEST_CU_OBJS:.o=.d} $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} +DEPS := 
${CXX_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ + $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} # tool, example, and test bins TOOL_BINS := ${TOOL_OBJS:.o=.bin} EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} @@ -126,11 +120,9 @@ EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} TOOL_BIN_LINKS := ${TOOL_BINS:.bin=} # Put the test binaries in build/test for convenience. TEST_BIN_DIR := $(BUILD_DIR)/test -TEST_CU_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ - $(foreach obj,$(TEST_CU_OBJS),$(basename $(notdir $(obj)))))) TEST_CXX_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ $(foreach obj,$(TEST_CXX_OBJS),$(basename $(notdir $(obj)))))) -TEST_BINS := $(TEST_CXX_BINS) $(TEST_CU_BINS) +TEST_BINS := $(TEST_CXX_BINS) # TEST_ALL_BIN is the test binary that links caffe dynamically. TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin @@ -139,30 +131,15 @@ TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin ############################## WARNS_EXT := warnings.txt CXX_WARNS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o.$(WARNS_EXT)}) -CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o.$(WARNS_EXT)}) TOOL_WARNS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o.$(WARNS_EXT)}) EXAMPLE_WARNS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o.$(WARNS_EXT)}) TEST_WARNS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o.$(WARNS_EXT)}) -TEST_CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o.$(WARNS_EXT)}) ALL_CXX_WARNS := $(CXX_WARNS) $(TOOL_WARNS) $(EXAMPLE_WARNS) $(TEST_WARNS) -ALL_CU_WARNS := $(CU_WARNS) $(TEST_CU_WARNS) -ALL_WARNS := $(ALL_CXX_WARNS) $(ALL_CU_WARNS) +ALL_WARNS := $(ALL_CXX_WARNS) EMPTY_WARN_REPORT := $(BUILD_DIR)/.$(WARNS_EXT) NONEMPTY_WARN_REPORT := $(BUILD_DIR)/$(WARNS_EXT) -############################## -# Derive include and lib directories -############################## -CUDA_INCLUDE_DIR := $(CUDA_DIR)/include - -CUDA_LIB_DIR := -# add /lib64 only if it exists -ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") - CUDA_LIB_DIR += 
$(CUDA_DIR)/lib64 -endif -CUDA_LIB_DIR += $(CUDA_DIR)/lib - ################################# # OpenCL include and library ################################# @@ -189,10 +166,6 @@ endif INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include ifneq ($(CPU_ONLY), 1) - INCLUDE_DIRS += $(CUDA_INCLUDE_DIR) - LIBRARY_DIRS += $(CUDA_LIB_DIR) - LIBRARIES := cudart cublas curand - INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR) LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR) LIBRARIES += OpenCL clBLAS @@ -216,7 +189,6 @@ ifneq ($(strip $(DISTRIBUTE_DIR)),distribute) endif ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS)) \ - $(addprefix $(BUILD_DIR)/cuda/, $(SRC_DIRS)) \ $(LIB_BUILD_DIR) $(TEST_BIN_DIR) $(PY_PROTO_BUILD_DIR) $(LINT_OUTPUT_DIR) \ $(DISTRIBUTE_SUBDIRS) $(PROTO_BUILD_INCLUDE_DIR)) @@ -235,7 +207,7 @@ DOXYGEN_SOURCES := $(shell find \ matlab/ \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh" -or \ + -name "*.cpp" -or -name "*.hpp" -or \ -name "*.py" -or -name "*.m") DOXYGEN_SOURCES += $(DOXYGEN_CONFIG_FILE) @@ -271,13 +243,8 @@ endif ifeq ($(OSX), 1) CXX := /usr/bin/clang++ ifneq ($(CPU_ONLY), 1) - CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release \d' | grep -o '\d') - ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1) - CXXFLAGS += -stdlib=libstdc++ - LINKFLAGS += -stdlib=libstdc++ - endif - # clang throws this warning for cuda headers - WARNINGS += -Wno-unneeded-internal-declaration + # todo + ############# endif # gtest needs to use its own tuple to not conflict with clang COMMON_FLAGS += -DGTEST_USE_OWN_TR1_TUPLE=1 @@ -313,12 +280,6 @@ else COMMON_FLAGS += -DNDEBUG -O2 endif -# cuDNN acceleration configuration. 
-ifeq ($(USE_CUDNN), 1) - LIBRARIES += cudnn - COMMON_FLAGS += -DUSE_CUDNN -endif - # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) @@ -403,7 +364,7 @@ PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library)) # # * Recursive with the exception that symbolic links are never followed, per the # default behavior of 'find'. -SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py .cuo +SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py # Set the sub-targets of the 'everything' target. EVERYTHING_TARGETS := all py$(PROJECT) test warn lint @@ -554,26 +515,12 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) -#$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS) -# @ echo NVCC $< -# $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ -# -odir $(@D) -# $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \ -# || (cat $@.$(WARNS_EXT); exit 1) -# @ cat $@.$(WARNS_EXT) - $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo CXX/LD -o $@ $< $(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib -$(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \ - $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) - @ echo LD $< - $(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib - $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \ $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo LD $< diff --git a/Makefile.config b/Makefile.config index 2d8124d6..829e2732 100644 --- a/Makefile.config +++ b/Makefile.config @@ -12,14 +12,14 @@ # CUSTOM_CXX := g++ # CUDA directory contains bin/ and lib/ directories that we need. 
-CUDA_DIR := /usr/local/cuda +#CUDA_DIR := /usr/local/cuda # On Ubuntu 14.04, if cuda tools are installed via # "sudo apt-get install nvidia-cuda-toolkit" then use this instead: # CUDA_DIR := /usr # CUDA architecture setting: going with all of them. # For CUDA < 6.0, comment the *_50 lines for compatibility. -CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ +#CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ -gencode arch=compute_20,code=sm_21 \ -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ From 4e424b45014446459e2142ffb9a0dd24512e56be Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 25 Aug 2015 14:36:34 +0800 Subject: [PATCH 038/124] modified the packing number --- include/caffe/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 4cd372a6..8113c181 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -84,7 +84,7 @@ private:\ #define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -#define global_packing_N 32 +#define global_packing_N 16 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ From 02762d4d22125a59673e30e50da2fb5da07b6927 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 27 Aug 2015 23:22:31 +0800 Subject: [PATCH 039/124] add clFinish in test --- tools/caffe.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/caffe.cpp b/tools/caffe.cpp index e350866f..d7953bdd 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -7,6 +7,7 @@ #include "boost/algorithm/string.hpp" #include "caffe/caffe.hpp" +#include "caffe/device.hpp" using caffe::Blob; using caffe::Caffe; @@ -15,7 +16,7 @@ using caffe::Layer; using caffe::shared_ptr; using caffe::Timer; using caffe::vector; - +using caffe::amdDevice; DEFINE_int32(gpu, 
-1, "Run in GPU mode on given device ID."); @@ -117,7 +118,7 @@ int train() { LOG(INFO) << "Use CPU."; Caffe::set_mode(Caffe::CPU); } - + LOG(INFO) << "Starting Optimization"; shared_ptr > solver(caffe::GetSolver(solver_param)); @@ -246,6 +247,9 @@ int time() { std::vector backward_time_per_layer(layers.size(), 0.0); double forward_time = 0.0; double backward_time = 0.0; + + clFinish(amdDevice.CommandQueue); + for (int j = 0; j < FLAGS_iterations; ++j) { Timer iter_timer; iter_timer.Start(); @@ -253,6 +257,9 @@ int time() { for (int i = 0; i < layers.size(); ++i) { timer.Start(); layers[i]->Forward(bottom_vecs[i], top_vecs[i]); + + clFinish(amdDevice.CommandQueue); + forward_time_per_layer[i] += timer.MicroSeconds(); } forward_time += forward_timer.MicroSeconds(); @@ -261,6 +268,9 @@ int time() { timer.Start(); layers[i]->Backward(top_vecs[i], bottom_need_backward[i], bottom_vecs[i]); + + clFinish(amdDevice.CommandQueue); + backward_time_per_layer[i] += timer.MicroSeconds(); } backward_time += backward_timer.MicroSeconds(); From 34401f6b35b45ecfd985c7754e2344a9f0526556 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 27 Aug 2015 23:34:56 +0800 Subject: [PATCH 040/124] fix cmake --- cmake/CaffeConfig.cmake | 61 +++++++++++++++++++++++++++++++ cmake/Dependencies.cmake | 23 +++++++----- cmake/OpenCL.cmake | 26 +++++++++++++ cmake/Summary.cmake | 2 + cmake/Templates/caffe_config.h.in | 4 ++ 5 files changed, 107 insertions(+), 9 deletions(-) create mode 100644 cmake/CaffeConfig.cmake create mode 100644 cmake/OpenCL.cmake diff --git a/cmake/CaffeConfig.cmake b/cmake/CaffeConfig.cmake new file mode 100644 index 00000000..076edc5d --- /dev/null +++ b/cmake/CaffeConfig.cmake @@ -0,0 +1,61 @@ +# Config file for the Caffe package. +# +# Note: +# Caffe and this config file depends on opencv, +# so put `find_package(OpenCV)` before searching Caffe +# via `find_package(Caffe)`. 
All other lib/includes +# dependencies are hard coded in the file +# +# After successful configuration the following variables +# will be defined: +# +# Caffe_INCLUDE_DIRS - Caffe include directories +# Caffe_LIBRARIES - libraries to link against +# Caffe_DEFINITIONS - a list of definitions to pass to compiler +# +# Caffe_HAVE_CUDA - signals about CUDA support +# Caffe_HAVE_CUDNN - signals about cuDNN support + + +# OpenCV dependency + +if(NOT OpenCV_FOUND) + set(Caffe_OpenCV_CONFIG_PATH "/usr/local/share/OpenCV") + if(Caffe_OpenCV_CONFIG_PATH) + get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE) + + if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) + message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) + endif() + + else() + find_package(OpenCV REQUIRED) + endif() + unset(Caffe_OpenCV_CONFIG_PATH) +endif() + +# Compute paths +get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +set(Caffe_INCLUDE_DIRS "/usr/local/include;/usr/include;/opt/AMDAPPSDK-2.9-1/include;/opt/clBLAS-2.1/include;/usr/local/include/opencv;/usr/include/atlas") + +get_filename_component(__caffe_include "${Caffe_CMAKE_DIR}/../../include" ABSOLUTE) +list(APPEND Caffe_INCLUDE_DIRS ${__caffe_include}) +unset(__caffe_include) + + +# Our library dependencies +if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) + include("${Caffe_CMAKE_DIR}/CaffeTargets.cmake") +endif() + +# List of IMPORTED libs created by CaffeTargets.cmake +set(Caffe_LIBRARIES caffe) + +# Definitions +set(Caffe_DEFINITIONS "-DCPU_ONLY") + +# Cuda support variables +set(Caffe_CPU_ONLY OFF) +set(Caffe_HAVE_CUDA FALSE) +set(Caffe_HAVE_CUDNN FALSE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7c86dd55..c4026084 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -44,17 +44,22 @@ include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) list(APPEND 
Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) # ---[ CUDA -include(cmake/Cuda.cmake) -if(NOT HAVE_CUDA) - if(CPU_ONLY) - message("-- CUDA is disabled. Building without it...") - else() - message("-- CUDA is not detected by cmake. Building without it...") - endif() +#include(cmake/Cuda.cmake) +#if(NOT HAVE_CUDA) +# if(CPU_ONLY) +# message("-- CUDA is disabled. Building without it...") +# else() +# message("-- CUDA is not detected by cmake. Building without it...") +# endif() # TODO: remove this not cross platform define in future. Use caffe_config.h instead. - add_definitions(-DCPU_ONLY) -endif() +# add_definitions(-DCPU_ONLY) +#endif() + +# ---[ OpenCL +include(cmake/OpenCL.cmake) +include_directories(SYSTEM ${OCL_INCLUDE_DIR} ${CLBLAS_INCLUDE_DIR}) +list(APPEND Caffe_LINKER_LIBS ${OCL_LIBRARIES} ${CLBLAS_LIBRARIES}) # ---[ OpenCV find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake new file mode 100644 index 00000000..e6d94642 --- /dev/null +++ b/cmake/OpenCL.cmake @@ -0,0 +1,26 @@ +if(CPU_ONLY) + return() +endif() + +#find_path(OCL_INCLUDE_DIR NAMES CL/cl.h PATHS "$ENV{AMDAPPSDKROOT}/include") +#find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS "$ENV{AMDAPPSDKROOT}/lib/x86_64") + +#find_path(CLBLAS_INCLUDE_DIR NAMES clBLAS.h PATHS /opt/clBLAS-2.1/include $ENV{C_INCLUDE_PATH} $ENV{CPLUS_INCLUDE_PATH}) +#find_library(CLBLAS_LIBRARIES NAMES libclBLAS.so PATHS $ENV{LD_LIBRARY_PATH}) + +#if(OCL_INCLUDE_DIR AND OCL_LIBRARIES) +# set(OCL_FOUND TRUE PARENT_SCOPE) +# message(STATUS "Found OpenCL (include: ${OCL_INCLUDE_DIR}, library: ${OCL_LIBRARIES})") +#endif() + +#if(CLBLAS_INCLUDE_DIR AND CLBLAS_LIBRARIES) +# set(CLBLAS_FOUND TRUE PARENT_SCOPE) +#endif() + +set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include) +set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so) +set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include) +set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so) + + + diff --git 
a/cmake/Summary.cmake b/cmake/Summary.cmake index e094ac00..19782add 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -126,6 +126,8 @@ function(caffe_print_configuration_summary) caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) +# caffe_status(" OpenCL : " OCL_FOUND THEN "Yes" ELSE "No") +# caffe_status(" clBLAS : " CLBLAS_FOUND THEN "Yes" ELSE "No") caffe_status("") if(HAVE_CUDA) caffe_status("NVIDIA CUDA:") diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 6039e8f6..ca9a3a9a 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -14,6 +14,10 @@ /* NVIDA cuDNN */ #cmakedefine CPU_ONLY +/* OpenCL & clBLAS*/ +#cmakedefine OCL_FOUND +#cmakedefine CLBLAS_FOUND + /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} From dfa3955728b3edcb29034e7dd36ba4590fc78eea Mon Sep 17 00:00:00 2001 From: Noplz Date: Fri, 28 Aug 2015 14:58:45 +0800 Subject: [PATCH 041/124] Remove cuda related code --- include/caffe/common.hpp | 12 ++++++------ include/caffe/util/benchmark.hpp | 4 ++-- include/caffe/util/device_alternate.hpp | 18 +++++++++--------- src/caffe/common.cpp | 12 ++++++------ src/caffe/util/math_functions.cpp | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 8113c181..b93e0d6d 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -218,10 +218,10 @@ class Caffe { return *(Get().random_generator_); } #ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return Get().curand_generator_; - } + //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + //inline static curandGenerator_t 
curand_generator() { + // return Get().curand_generator_; + //} #endif // Returns the mode: running on CPU or GPU. @@ -245,8 +245,8 @@ class Caffe { protected: #ifndef CPU_ONLY - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; + //cublasHandle_t cublas_handle_; + //curandGenerator_t curand_generator_; #endif shared_ptr random_generator_; diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index d6358277..890f31bf 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -28,8 +28,8 @@ class Timer { bool running_; bool has_run_at_least_once_; #ifndef CPU_ONLY - cudaEvent_t start_gpu_; - cudaEvent_t stop_gpu_; + //cudaEvent_t start_gpu_; + //cudaEvent_t stop_gpu_; #endif boost::posix_time::ptime start_cpu_; boost::posix_time::ptime stop_cpu_; diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 6ea595db..9184f4f9 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -31,11 +31,11 @@ void classname::funcname##_##gpu(const vector*>& top, \ #else // Normal GPU + CPU Caffe. -#include -#include -#include -#include -#include // cuda driver types +//#include +//#include +//#include +//#include +//#include // cuda driver types #ifdef USE_CUDNN // cuDNN acceleration library. #include "caffe/util/cudnn.hpp" #endif @@ -45,8 +45,8 @@ void classname::funcname##_##gpu(const vector*>& top, \ // // CUDA: various checks for different function calls. +/* #define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ do { \ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ @@ -74,12 +74,12 @@ void classname::funcname##_##gpu(const vector*>& top, \ // CUDA: check for error after kernel execution and exit loudly if there is one. 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - +*/ namespace caffe { // CUDA: library error reporting. -const char* cublasGetErrorString(cublasStatus_t error); -const char* curandGetErrorString(curandStatus_t error); +//const char* cublasGetErrorString(cublasStatus_t error); +//const char* curandGetErrorString(curandStatus_t error); // CUDA: thread number configuration. // Use 1024 threads per block, which requires cuda sm_2x or above, diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 5d56493b..3891852a 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -223,7 +223,7 @@ void* Caffe::RNG::generator() { return static_cast(generator_->rng()); } -const char* cublasGetErrorString(cublasStatus_t error) { +//const char* cublasGetErrorString(cublasStatus_t error) { /* switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -251,10 +251,10 @@ const char* cublasGetErrorString(cublasStatus_t error) { #endif } */ - return "Unknown cublas status"; -} +// return "Unknown cublas status"; +//} -const char* curandGetErrorString(curandStatus_t error) { +//const char* curandGetErrorString(curandStatus_t error) { /*switch (error) { case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; @@ -284,8 +284,8 @@ const char* curandGetErrorString(curandStatus_t error) { return "CURAND_STATUS_INTERNAL_ERROR"; } */ - return "Unknown curand status"; -} + // return "Unknown curand status"; +//} #endif // CPU_ONLY diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 3bef8b63..d48ec01a 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -765,7 +765,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY // NOLINT_NEXT_LINE(caffe/alt_fn) - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); + //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); #else NO_GPU; #endif 
From 415f603ec57b7f1f35cfe361fba8f0ff09ba1023 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 30 Aug 2015 16:36:53 +0800 Subject: [PATCH 042/124] add FindOpenCL and FindclBLAS in cmake/ --- cmake/Dependencies.cmake | 11 +++- cmake/Modules/FindOpenCL.cmake | 108 +++++++++++++++++++++++++++++++++ cmake/Modules/FindclBLAS.cmake | 98 ++++++++++++++++++++++++++++++ cmake/OpenCL.cmake | 4 +- cmake/Summary.cmake | 6 +- 5 files changed, 219 insertions(+), 8 deletions(-) create mode 100644 cmake/Modules/FindOpenCL.cmake create mode 100644 cmake/Modules/FindclBLAS.cmake diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c4026084..eb72e89f 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -57,9 +57,14 @@ list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) #endif() # ---[ OpenCL -include(cmake/OpenCL.cmake) -include_directories(SYSTEM ${OCL_INCLUDE_DIR} ${CLBLAS_INCLUDE_DIR}) -list(APPEND Caffe_LINKER_LIBS ${OCL_LIBRARIES} ${CLBLAS_LIBRARIES}) +find_package(OpenCL REQUIRED) +include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${OPENCL_LIBRARIES}) + +# ---[ clBLAS +find_package(clBLAS REQUIRED) +include_directories(SYSTEM ${CLBLAS_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${CLBLAS_LIBRARIES}) # ---[ OpenCV find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 00000000..7c23701d --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,108 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an OpenCL implementation. +# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) +# +# Defines the following variables: +# +# OPENCL_FOUND - Found the OPENCL framework +# OPENCL_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# OPENCL_LIBRARIES - libopencl +# +# Accepts the following variables as input: +# +# OPENCL_ROOT - (as a CMake or environment variable) +# The root directory of the OpenCL implementation found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(OPENCL REQUIRED) +# include_directories(${OPENCL_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${OPENCL_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(OPENCL_INCLUDE_DIRS + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + /opt/cuda/include + DOC "OpenCL header file path" +) +mark_as_advanced( OPENCL_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL 
PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +else( ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +endif( ) +mark_as_advanced( OPENCL_LIBRARIES ) + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) + +if( NOT OPENCL_FOUND ) + message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +else () + message( STATUS "Found OpenCL (include: ${OPENCL_INCLUDE_DIRS}, library: ${OPENCL_LIBRARIES})") +endif() diff --git a/cmake/Modules/FindclBLAS.cmake b/cmake/Modules/FindclBLAS.cmake new file mode 100644 index 00000000..1fa28762 --- /dev/null +++ b/cmake/Modules/FindclBLAS.cmake @@ -0,0 +1,98 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an clBLAS library. 
+# +# Defines the following variables: +# +# CLBLAS_FOUND - Found the CLBLAS library +# CLBLAS_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# CLBLAS_LIBRARIES - libclBLAS +# +# Accepts the following variables as input: +# +# CLBLAS_ROOT - (as a CMake or environment variable) +# The root directory of the clBLAS library found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findclBLAS should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(clBLAS REQUIRED) +# include_directories(${CLBLAS_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${CLBLAS_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(CLBLAS_INCLUDE_DIRS NAMES clBLAS.h + HINTS + $ENV{CLBLAS_ROOT}/include + PATHS + /usr/include + /usr/local/include + DOC "clBLAS header file path" +) +mark_as_advanced( CLBLAS_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib64 + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +else( ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +endif( ) +mark_as_advanced( CLBLAS_LIBRARIES ) + +if (NOT CLBLAS_INCLUDE_DIRS) + set(CLBLAS_FOUND ON) +endif() + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS ) + +if( NOT CLBLAS_FOUND ) + message( STATUS "FindclBLAS looked for libraries named: clBLAS" ) +else () + message( STATUS 
"Found clBLAS (include: ${CLBLAS_INCLUDE_DIRS}, library: ${CLBLAS_LIBRARIES})") +endif() diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake index e6d94642..c83ce7eb 100644 --- a/cmake/OpenCL.cmake +++ b/cmake/OpenCL.cmake @@ -17,8 +17,8 @@ endif() # set(CLBLAS_FOUND TRUE PARENT_SCOPE) #endif() -set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include) -set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so) +#set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include) +#set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so) set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include) set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 19782add..2d95b0a9 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -125,9 +125,9 @@ function(caffe_print_configuration_summary) caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") - caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) -# caffe_status(" OpenCL : " OCL_FOUND THEN "Yes" ELSE "No") -# caffe_status(" clBLAS : " CLBLAS_FOUND THEN "Yes" ELSE "No") +# caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. 
${CUDA_VERSION})" ELSE "No" ) + caffe_status(" OpenCL : " OPENCL_FOUND THEN "Yes" ELSE "No") + caffe_status(" clBLAS : " CLBLAS_FOUND THEN "Yes" ELSE "No") caffe_status("") if(HAVE_CUDA) caffe_status("NVIDIA CUDA:") From 17104ed502ae133bab908aa967c4dc1e395ca26f Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 14:50:21 +0800 Subject: [PATCH 043/124] Fixed conv layers opt2 bug --- include/caffe/common.hpp | 2 +- include/caffe/vision_layers.hpp | 8 ++- src/caffe/device.cpp | 42 +---------- src/caffe/layers/base_conv_layer.cpp | 101 ++++++++------------------- src/caffe/layers/conv_layer.cpp | 47 ++++--------- 5 files changed, 52 insertions(+), 148 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index b93e0d6d..97d1a985 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -81,7 +81,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 237e9cbf..2f2d7eef 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -105,6 +105,7 @@ class BaseConvolutionLayer : public Layer { col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); } + protected: inline void conv_im2col_gpu_opt(const Dtype* data) { im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); @@ -113,11 +114,12 @@ class BaseConvolutionLayer : public Layer { col2im_gpu_opt((Dtype*)transMem, 0, 
conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); } + private: inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { - transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); + transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2); } inline void conv_transpose_gpu(const Dtype* data){ - opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); } protected: inline void gpu_memset(Dtype* data, Dtype value, int count) { @@ -147,7 +149,7 @@ class BaseConvolutionLayer : public Layer { int weight_offset_; int col_offset_; int output_offset_; - int top_offset_, top_offset_n, bottom_offset_; + int top_offset_, top_offset_opt, bottom_offset_; public: static cl_mem subTopMem, transMem; static size_t subtop_mem_size, trans_mem_size; diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 7a866c11..960d8bf1 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -75,9 +75,7 @@ cl_int Device::Init(){ GetDeviceInfo(); cl_uint uiNumDevices; cl_bool unified_memory = false; -/* switch(Caffe::mode()) { - case Caffe::GPU: - //choose_gpu(); + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); uiNumDevices = numDevices; if(0 == uiNumDevices){ @@ -95,44 +93,6 @@ cl_int Device::Init(){ } } } - LOG(INFO) << "picked device type: GPU"; - break; - case Caffe::CPU: - //choose_cpu(); - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); - uiNumDevices = numDevices; - if(0 == uiNumDevices){ - LOG(FATAL) << "Err: No CPU devices"; - } - pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); - OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) ); - LOG(INFO) << "picked device type: CPU"; - break; -*/ -// case Caffe::APU: - 
clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - uiNumDevices = numDevices; - if(0 == uiNumDevices){ - LOG(FATAL) << "Err: No GPU devices"; - } - else{ - pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); - OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices)); - for (int i = 0; i < (int)uiNumDevices; i++){ - clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); - if(unified_memory) //we pick the first GPU we found - pDevices[0] = pDevices[i]; - else {//skip dGPU - continue; - } - } - } - LOG(INFO) << "picked device type: APU"; - // break; - // default: - // LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - // } - //Create Context Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); if(NULL == Context){ diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 1c1379b3..faa7b63c 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -33,19 +33,9 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) template void BaseConvolutionLayer::ocl_setup() { -/* im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL); - col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL); - oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL); - im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); - col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL); - opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL); - ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL); - ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL); -*/ - M_ = conv_out_channels_ / group_; - K_ = kernel_dim_ / group_; - N_ = conv_out_spatial_dim_; - + M_ = num_output_ / 
group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; + N_ = height_out_ * width_out_; #ifdef use_packing_scheme size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); @@ -56,15 +46,6 @@ void BaseConvolutionLayer::ocl_setup() { template BaseConvolutionLayer::~BaseConvolutionLayer(){ - /* - OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) ); - OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) ); - OCL_CHECK( clReleaseKernel(oclmem_kernel) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); - OCL_CHECK( clReleaseKernel(im2col_opt_kernel) ); - OCL_CHECK( clReleaseKernel(col2im_opt_kernel) ); -*/ } @@ -314,9 +295,10 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - conv_im2col_gpu_opt(input); + //conv_im2col_gpu_opt(input); + im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, + (Dtype*)transMem, 0, opt_num2); } - //col_buff = col_buffer_.gpu_data(); } #ifdef multiQ for (int g = 0; g < group_; ++g) { @@ -324,7 +306,7 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, else Queue = amdDevice.CommandQueue_helper; prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } if(group_ == 2){ clFinish(amdDevice.CommandQueue); @@ -335,10 +317,11 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, for (int g = 0; g < group_; ++g) { prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, 
top_offset_ * g); + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } #endif - conv_transform_gpu((Dtype*)subTopMem, output); + //conv_transform_gpu((Dtype*)subTopMem, output); + transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2); } @@ -358,7 +341,7 @@ void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype)1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., output, top_offset_n + num_output_ * N_ * z); + (Dtype)1., output, top_offset_ + num_output_ * N_ * z); } template @@ -371,7 +354,7 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, for (int g = 0; g < group_; ++g) { caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights, weight_offset_ * g, - output, top_offset_+output_offset_ * g, + output, top_offset_ + output_offset_ * g, (Dtype)0., col_buff, col_offset_ * g); } if (!is_1x1_) { @@ -382,7 +365,6 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, const Dtype* weights, Dtype* input) { - //Dtype* col_buff = col_buffer_.mutable_gpu_data(); cl_command_queue Queue; if (is_1x1_) { int count = height_ * width_ * conv_in_channels_ * opt_num2; @@ -395,9 +377,9 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #else Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_, + caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype)1., weights, weight_offset_ * g, - (Dtype*)subTopMem, top_offset_ * g, + (Dtype*)subTopMem, top_offset_opt * g, (Dtype)0., (Dtype*)transMem, col_offset_ * g); } #ifdef multiQ @@ -408,8 +390,10 @@ void 
BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #endif if (!is_1x1_) { - conv_col2im_gpu_opt(input); - } + //conv_col2im_gpu_opt(input); + col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, + stride_w_, input, bottom_offset_, opt_num2); + } } template @@ -433,10 +417,14 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* col_buff = input; cl_command_queue Queue; if (!is_1x1_) { - conv_im2col_gpu_opt(input); - //col_buff = col_buffer_.gpu_data(); + //conv_im2col_gpu_opt(input); + im2col_gpu_opt(input, bottom_offset_, channels_, height_, + width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } - conv_transpose_gpu(output); + //conv_transpose_gpu(output); + int height_top = M_ * group_, width_top = N_; + opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); + for (int g = 0; g < group_; ++g) { #ifdef multiQ @@ -445,8 +433,8 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, #else Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, - (Dtype)1., (Dtype*)subTopMem, top_offset_ * g, + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g, (Dtype*)transMem, col_offset_ * g, (Dtype)1., (Dtype*)weights, weight_offset_ * g); #ifdef multiQ @@ -461,10 +449,8 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - /* caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.gpu_data(), 1., bias);*/ - caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_*width_out_, - (Dtype)1., input, top_offset_, height_out_*width_out_, + caffe_gpu_gemv(CblasNoTrans, num_output_, N_, + (Dtype)1., input, 
top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias, (size_t)0, 1); } @@ -475,12 +461,9 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); Dtype* top_data = top[i]->mutable_gpu_data(); Dtype* col_data = col_buffer_.mutable_gpu_data(); - /*in the packing schme, M, K stay the same. N multiplies by opt_num becomes much bigger N'. - N' is the M in sgemm call.*/ int M_org = M_ * group_; int col_offset = K_ * N_; int top_offset = M_ * N_; @@ -488,19 +471,13 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo int opt_num2 = global_packing_N; cl_command_queue Queue; cl_event prof_event; - //LOG(INFO) << "conv_fp optimized scheme"; for (int n = 0; n < num_; n += opt_num2) { opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - /*col_offset is the offset for sgemm, including packing and groups - for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ top_offset = M_ * N_ * opt_num2; col_offset = K_ * N_ * opt_num2; - //step1: packed im2col, col_size = (K_ * group_ ) * N_ - //this should be opt_num2 images packing together. 
im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - //step 2: sgemm: Top (subTopMem) = weight * col_data #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; @@ -521,10 +498,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo (Dtype)0., (Dtype*)subTopMem, top_offset * g); } #endif - //step 3: tranform transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); - //step 4: add bias - /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/ for (int z = 0; z < opt_num2; z++) if (bias_term_) { @@ -551,7 +525,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < num_; ++n) { - caffe_gpu_gemv(CblasNoTrans, M_, N_, + caffe_gpu_gemv(CblasNoTrans, num_output_, N_, (Dtype)1., top_diff, top[i]->offset(n), N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias_diff, (size_t)0, 1); @@ -570,25 +544,17 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t int g = 0; cl_command_queue Queue; cl_event prof_event; - //LOG(INFO) << "conv_bp optimized scheme"; for (int n = 0; n < num_; n += opt_num2) { opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - /*col_offset is the offset for sgemm, including packing and groups - for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ top_offset = M_ * (N_ * opt_num2); col_offset = K_ * (N_ * opt_num2); - //step1: packed im2col, col_size = (K_ * group_ ) * N_ - //this should be opt_num2 images packing together. 
im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize int height_top = M_ * group_, width_top = N_; - //if (opt_num2 >1) opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); - //step 3: sgemm: Top (subTopMem) = weight * col_data for(g = 0; g < group_; ++g) { #ifdef multiQ if(g == 0) Queue = amdDevice.CommandQueue; @@ -602,7 +568,6 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t (Dtype*)weight_diff, weight_offset * g); } - //step4: if (propagate_down[i]) { for (g = 0; g < group_; ++g) { #ifdef multiQ @@ -624,14 +589,8 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t clFinish(amdDevice.CommandQueue_helper); } #endif - - //step5: col2im col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); -#ifdef Track_layer - LOG(WARNING) << "conv bp done"; -#endif - } } } diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 8f7d8f82..369fbacd 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -1,5 +1,4 @@ #include - #include "caffe/filler.hpp" #include "caffe/layer.hpp" #include "caffe/util/im2col.hpp" @@ -33,7 +32,7 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } } -// CHECK_BLOB_DATA(top[0],20, "top[0]"); + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -67,9 +66,6 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } - //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff"); } @@ -80,7 +76,6 @@ void 
ConvolutionLayer::Forward_gpu(const vector*>& bottom, Forward_gpu_opt(bottom, top); else Forward_gpu_org(bottom, top); -// CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -97,11 +92,6 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); this->forward_gpu_opt(bottom, weight, top); - -#ifdef Track_layer - LOG(WARNING) << "conv fp done"; -#endif - } template @@ -114,14 +104,14 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto Dtype* top_data = top[i]->mutable_gpu_data(); this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; //intermediate variables to pass offset - this->top_offset_ = this->M_ * this->N_ * this->opt_num2; - this->top_offset_n = top[i]->offset(n); + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); this->col_offset_ = this->K_ * this->N_ * this->opt_num2; this->bottom_offset_ = bottom[i]->offset(n); - this->weight_offset_ = this->M_ * this->K_; this->forward_gpu_gemm_opt(bottom_data, weight, top_data); if (this->bias_term_) { @@ -131,8 +121,8 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto } } - CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - CHECK_BLOB_DATA(top[0],20, "top[0]"); + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } @@ -160,7 +150,7 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - // CHECK_BLOB_DATA(top[0],20, "top[0]"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -180,30 +170,31 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, // Bias gradient, if necessary. 
if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - this->gpu_memset(bias_diff, 0., this->blobs_[1]->count()); + ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < this->num_; ++n) { - // this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); this->backward_gpu_bias(bias_diff, top_diff); } - } + } if (this->param_propagate_down_[0] || propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); this->weight_offset_ = this->M_ * this->K_; this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; ++n) { + for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; - this->top_offset_n = top[i]->offset(n); + this->top_offset_ = top[i]->offset(n); this->bottom_offset_ = bottom[i]->offset(n); this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_ = this->M_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); } + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { this->backward_gpu_gemm_opt(top_diff, weight, @@ -213,10 +204,6 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, } } - CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, @@ -256,10 +243,6 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } } -// CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); -// CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); -// CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY From 33b8282220b218da52e8b4c738ca680a887e7dcc Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 15:29:44 +0800 Subject: [PATCH 044/124] conv clean up --- src/caffe/layers/base_conv_layer.cpp | 23 +++++++++-------------- src/caffe/layers/conv_layer.cpp | 7 ++----- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index faa7b63c..6071c49b 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -295,9 +295,9 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - //conv_im2col_gpu_opt(input); - im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, - (Dtype*)transMem, 0, opt_num2); + conv_im2col_gpu_opt(input); + // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, + // (Dtype*)transMem, 0, opt_num2); } } #ifdef multiQ @@ -390,9 +390,9 @@ 
void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #endif if (!is_1x1_) { - //conv_col2im_gpu_opt(input); - col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - stride_w_, input, bottom_offset_, opt_num2); + conv_col2im_gpu_opt(input); + // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, + // stride_w_, input, bottom_offset_, opt_num2); } } @@ -414,12 +414,11 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; cl_command_queue Queue; if (!is_1x1_) { - //conv_im2col_gpu_opt(input); - im2col_gpu_opt(input, bottom_offset_, channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + conv_im2col_gpu_opt(input); + //im2col_gpu_opt(input, bottom_offset_, channels_, height_, + // width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } //conv_transpose_gpu(output); int height_top = M_ * group_, width_top = N_; @@ -462,8 +461,6 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); - - Dtype* col_data = col_buffer_.mutable_gpu_data(); int M_org = M_ * group_; int col_offset = K_ * N_; int top_offset = M_ * N_; @@ -535,8 +532,6 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t if (this->param_propagate_down_[0] || propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); int col_offset = K_ * N_; int top_offset = M_ * N_; int weight_offset = M_ * K_; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 369fbacd..020098aa 
100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -73,7 +73,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt(bottom, top); + Forward_gpu_opt2(bottom, top); else Forward_gpu_org(bottom, top); } @@ -82,7 +82,7 @@ template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N >1) - Backward_gpu_opt(top, propagate_down, bottom); + Backward_gpu_opt2(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); } @@ -192,9 +192,6 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); } - this->bottom_offset_ = bottom[i]->offset(n); - this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { this->backward_gpu_gemm_opt(top_diff, weight, From 40cdc3e41abcde36b8816f3ce3556c230633992f Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 22:52:23 +0800 Subject: [PATCH 045/124] removed all cuDNN files --- src/caffe/layers/base_conv_layer.cpp | 6 - src/caffe/layers/cudnn_conv_layer.cpp | 130 ------ src/caffe/layers/cudnn_pooling_layer.cpp | 50 --- src/caffe/layers/cudnn_relu_layer.cpp | 46 --- src/caffe/layers/cudnn_sigmoid_layer.cpp | 46 --- src/caffe/layers/cudnn_softmax_layer.cpp | 50 --- src/caffe/layers/cudnn_tanh_layer.cpp | 46 --- src/caffe/layers/cufiles/absval_layer.cu | 33 -- src/caffe/layers/cufiles/base_data_layer.cu | 30 -- src/caffe/layers/cufiles/bnll_layer.cu | 60 --- src/caffe/layers/cufiles/concat_layer.cu | 71 ---- .../layers/cufiles/contrastive_loss_layer.cu | 111 ----- src/caffe/layers/cufiles/conv_layer.cu | 64 --- src/caffe/layers/cufiles/cudnn_conv_layer.cu | 160 -------- .../layers/cufiles/cudnn_pooling_layer.cu | 45 -- src/caffe/layers/cufiles/cudnn_relu_layer.cu | 57 --- .../layers/cufiles/cudnn_sigmoid_layer.cu | 47 --- .../layers/cufiles/cudnn_softmax_layer.cu | 48 --- src/caffe/layers/cufiles/cudnn_tanh_layer.cu | 48 --- src/caffe/layers/cufiles/deconv_layer.cu | 64 --- src/caffe/layers/cufiles/dropout_layer.cu | 77 ---- src/caffe/layers/cufiles/eltwise_layer.cu | 135 ------ .../layers/cufiles/euclidean_loss_layer.cu | 44 -- src/caffe/layers/cufiles/exp_layer.cu | 44 -- src/caffe/layers/cufiles/filter_layer.cu | 70 ---- src/caffe/layers/cufiles/hdf5_data_layer.cu | 53 --- src/caffe/layers/cufiles/hdf5_output_layer.cu | 43 -- src/caffe/layers/cufiles/im2col_layer.cu | 37 -- .../layers/cufiles/inner_product_layer.cu | 57 --- src/caffe/layers/cufiles/log_layer.cu | 57 --- src/caffe/layers/cufiles/lrn_layer.cu | 203 --------- src/caffe/layers/cufiles/mvn_layer.cu | 124 ------ src/caffe/layers/cufiles/pooling_layer.cu | 387 ------------------ src/caffe/layers/cufiles/power_layer.cu | 87 ---- 
src/caffe/layers/cufiles/prelu_layer.cu | 128 ------ src/caffe/layers/cufiles/reduction_layer.cu | 93 ----- src/caffe/layers/cufiles/relu_layer.cu | 65 --- .../sigmoid_cross_entropy_loss_layer.cu | 37 -- src/caffe/layers/cufiles/sigmoid_layer.cu | 62 --- src/caffe/layers/cufiles/silence_layer.cu | 28 -- src/caffe/layers/cufiles/slice_layer.cu | 71 ---- src/caffe/layers/cufiles/softmax_layer.cu | 149 ------- .../layers/cufiles/softmax_loss_layer.cu | 125 ------ src/caffe/layers/cufiles/split_layer.cu | 38 -- src/caffe/layers/cufiles/tanh_layer.cu | 59 --- src/caffe/layers/cufiles/threshold_layer.cu | 33 -- 46 files changed, 3518 deletions(-) delete mode 100644 src/caffe/layers/cudnn_conv_layer.cpp delete mode 100644 src/caffe/layers/cudnn_pooling_layer.cpp delete mode 100644 src/caffe/layers/cudnn_relu_layer.cpp delete mode 100644 src/caffe/layers/cudnn_sigmoid_layer.cpp delete mode 100644 src/caffe/layers/cudnn_softmax_layer.cpp delete mode 100644 src/caffe/layers/cudnn_tanh_layer.cpp delete mode 100644 src/caffe/layers/cufiles/absval_layer.cu delete mode 100644 src/caffe/layers/cufiles/base_data_layer.cu delete mode 100644 src/caffe/layers/cufiles/bnll_layer.cu delete mode 100644 src/caffe/layers/cufiles/concat_layer.cu delete mode 100644 src/caffe/layers/cufiles/contrastive_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/conv_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_conv_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_pooling_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_relu_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_softmax_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_tanh_layer.cu delete mode 100644 src/caffe/layers/cufiles/deconv_layer.cu delete mode 100644 src/caffe/layers/cufiles/dropout_layer.cu delete mode 100644 src/caffe/layers/cufiles/eltwise_layer.cu delete mode 100644 
src/caffe/layers/cufiles/euclidean_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/exp_layer.cu delete mode 100644 src/caffe/layers/cufiles/filter_layer.cu delete mode 100644 src/caffe/layers/cufiles/hdf5_data_layer.cu delete mode 100644 src/caffe/layers/cufiles/hdf5_output_layer.cu delete mode 100644 src/caffe/layers/cufiles/im2col_layer.cu delete mode 100644 src/caffe/layers/cufiles/inner_product_layer.cu delete mode 100644 src/caffe/layers/cufiles/log_layer.cu delete mode 100644 src/caffe/layers/cufiles/lrn_layer.cu delete mode 100644 src/caffe/layers/cufiles/mvn_layer.cu delete mode 100644 src/caffe/layers/cufiles/pooling_layer.cu delete mode 100644 src/caffe/layers/cufiles/power_layer.cu delete mode 100644 src/caffe/layers/cufiles/prelu_layer.cu delete mode 100644 src/caffe/layers/cufiles/reduction_layer.cu delete mode 100644 src/caffe/layers/cufiles/relu_layer.cu delete mode 100644 src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/sigmoid_layer.cu delete mode 100644 src/caffe/layers/cufiles/silence_layer.cu delete mode 100644 src/caffe/layers/cufiles/slice_layer.cu delete mode 100644 src/caffe/layers/cufiles/softmax_layer.cu delete mode 100644 src/caffe/layers/cufiles/softmax_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/split_layer.cu delete mode 100644 src/caffe/layers/cufiles/tanh_layer.cu delete mode 100644 src/caffe/layers/cufiles/threshold_layer.cu diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 6071c49b..19458185 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -296,8 +296,6 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, if (!is_1x1_) { if (!skip_im2col) { conv_im2col_gpu_opt(input); - // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, - // (Dtype*)transMem, 0, opt_num2); } } #ifdef multiQ @@ -391,8 +389,6 @@ 
void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, if (!is_1x1_) { conv_col2im_gpu_opt(input); - // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - // stride_w_, input, bottom_offset_, opt_num2); } } @@ -417,8 +413,6 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, cl_command_queue Queue; if (!is_1x1_) { conv_im2col_gpu_opt(input); - //im2col_gpu_opt(input, bottom_offset_, channels_, height_, - // width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } //conv_transpose_gpu(output); int height_top = M_ * group_, width_top = N_; diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp deleted file mode 100644 index 104d2b9d..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// Set to three for the benefit of the backward pass, which -// can use separate streams for calculating the gradient w.r.t. -// bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 3 - -/** - * TODO(dox) explain cuDNN interface - */ -template -void CuDNNConvolutionLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize CUDA streams and cuDNN. - stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - workspaceSizeInBytes = 0; - workspace = NULL; - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - CUDA_CHECK(cudaStreamCreate(&stream_[g])); - CUDNN_CHECK(cudnnCreate(&handle_[g])); - CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); - } - - // Set the indexing parameters. 
- weight_offset_ = (this->num_output_ / this->group_) - * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; - bias_offset_ = (this->num_output_ / this->group_); - - // Create filter descriptor. - cudnn::createFilterDesc(&filter_desc_, - this->num_output_ / this->group_, this->channels_ / this->group_, - this->kernel_h_, this->kernel_w_); - - // Create tensor descriptor(s) for data and corresponding convolution(s). - for (int i = 0; i < bottom.size(); i++) { - cudnnTensorDescriptor_t bottom_desc; - cudnn::createTensor4dDesc(&bottom_desc); - bottom_descs_.push_back(bottom_desc); - cudnnTensorDescriptor_t top_desc; - cudnn::createTensor4dDesc(&top_desc); - top_descs_.push_back(top_desc); - cudnnConvolutionDescriptor_t conv_desc; - cudnn::createConvolutionDesc(&conv_desc); - conv_descs_.push_back(conv_desc); - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::createTensor4dDesc(&bias_desc_); - } - - handles_setup_ = true; -} - -template -void CuDNNConvolutionLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::Reshape(bottom, top); - bottom_offset_ = (this->channels_ / this->group_) - * this->height_ * this->width_; - top_offset_ = (this->num_output_ / this->group_) - * this->height_out_ * this->width_out_; - - for (int i = 0; i < bottom.size(); i++) { - cudnn::setTensor4dDesc(&bottom_descs_[i], - this->num_, - this->channels_ / this->group_, - this->height_, this->width_, - this->channels_ * this->height_ * this->width_, - this->height_ * this->width_, - this->width_, 1); - cudnn::setTensor4dDesc(&top_descs_[i], - this->num_, - this->num_output_ / this->group_, - this->height_out_, this->width_out_, - this->num_output_ * this->height_out_ * this->width_out_, - this->height_out_ * this->width_out_, - this->width_out_, 1); - cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - } - - // Tensor descriptor for 
bias. - if (this->bias_term_) { - cudnn::setTensor4dDesc(&bias_desc_, - 1, this->num_output_ / this->group_, 1, 1); - } -} - -template -CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - for (int i = 0; i < bottom_descs_.size(); i++) { - cudnnDestroyTensorDescriptor(bottom_descs_[i]); - cudnnDestroyTensorDescriptor(top_descs_[i]); - cudnnDestroyConvolutionDescriptor(conv_descs_[i]); - } - if (this->bias_term_) { - cudnnDestroyTensorDescriptor(bias_desc_); - } - cudnnDestroyFilterDescriptor(filter_desc_); - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - cudaStreamDestroy(stream_[g]); - cudnnDestroy(handle_[g]); - } - - delete [] stream_; - delete [] handle_; -} - -INSTANTIATE_CLASS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp deleted file mode 100644 index c92c4e47..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::LayerSetUp(bottom, top); - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createPoolingDesc(&pooling_desc_, - this->layer_param_.pooling_param().pool(), &mode_, - this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - handles_setup_ = true; -} - -template -void CuDNNPoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - 
this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->pooled_height_, this->pooled_width_); -} - -template -CuDNNPoolingLayer::~CuDNNPoolingLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroyPoolingDescriptor(pooling_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp deleted file mode 100644 index 759d8398..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ReLULayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - ReLULayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNReLULayer::~CuDNNReLULayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp deleted file mode 100644 index 32637873..00000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSigmoidLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp deleted file mode 100644 index 77a3225a..00000000 --- a/src/caffe/layers/cudnn_softmax_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::LayerSetUp(bottom, top); - // Initialize CUDNN. - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::Reshape(bottom, top); - int N = this->outer_num_; - int K = bottom[0]->shape(this->softmax_axis_); - int H = this->inner_num_; - int W = 1; - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSoftmaxLayer::~CuDNNSoftmaxLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp deleted file mode 100644 index 376faad3..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - TanHLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNTanHLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - TanHLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNTanHLayer::~CuDNNTanHLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu deleted file mode 100644 index bb310e1a..00000000 --- a/src/caffe/layers/cufiles/absval_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void AbsValLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); -} - -template -void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu deleted file mode 100644 index 9335a5bc..00000000 --- a/src/caffe/layers/cufiles/base_data_layer.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe/data_layers.hpp" - -namespace caffe { - -template -void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - // Reshape to loaded data. 
- top[0]->ReshapeLike(this->prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_gpu_data()); - } - // Start a new prefetch thread - CreatePrefetchThread(); -} - -INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu deleted file mode 100644 index d963d068..00000000 --- a/src/caffe/layers/cufiles/bnll_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -const float kBNLL_THRESHOLD = 50.; - -template -__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? - in[index] + log(1. + exp(-in[index])) : - log(1. 
+ exp(in[index])); - } -} - -template -void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BNLLBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } -} - -template -void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward<<>>( - count, top_diff, bottom_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu deleted file mode 100644 index 8f2e85d8..00000000 --- a/src/caffe/layers/cufiles/concat_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Concat(const int nthreads, const Dtype* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - 
const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } -} - -template -void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - offset_concat_axis += bottom_concat_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer); - 
-} // namespace caffe diff --git a/src/caffe/layers/cufiles/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu deleted file mode 100644 index 93123931..00000000 --- a/src/caffe/layers/cufiles/contrastive_loss_layer.cu +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), - Dtype(0.0)); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -__global__ void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - CUDA_KERNEL_LOOP(i, count) { 
- int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } -} - -template -void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward<<>>( - count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu deleted file mode 100644 index b8a98ff7..00000000 --- a/src/caffe/layers/cufiles/conv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void 
ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu deleted file mode 100644 index b4e802e1..00000000 --- a/src/caffe/layers/cufiles/cudnn_conv_layer.cu +++ /dev/null @@ -1,160 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -__global__ void sync_conv_groups() { } - -template -void CuDNNConvolutionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - - size_t workspace_limit_bytes = this->kernel_h_ * - this->kernel_w_ * - this->channels_ * - sizeof(int) + 1; - - // Forward through cuDNN in parallel over groups. 
- for (int g = 0; g < this->group_; g++) { - cudnnConvolutionFwdAlgo_t algo; - - // pick the convolution algorithm - // TODO(shelhamer) this should be done during reshape - // TODO(shelhamer) the choice of automatic or manual algorithm picking - // should be exposed in proto - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, // memoryLimitInBytes, - &algo)); - - // get minimum size of the workspace needed for the desired algorithm - size_t workspaceSizeInBytes_temp = 0; - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - algo, - &workspaceSizeInBytes_temp)); - - if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { - workspaceSizeInBytes = workspaceSizeInBytes_temp; - // free the existing workspace and allocate a new (larger) one - cudaFree(this->workspace); - cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); - if (err != cudaSuccess) { - // force zero memory path - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - workspace = NULL; - workspaceSizeInBytes = 0; - } - } - - // Filters. - CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + weight_offset_ * g, - conv_descs_[i], - algo, workspace, workspaceSizeInBytes, - cudnn::dataType::zero, - top_descs_[i], top_data + top_offset_ * g)); - - // Bias. 
- if (this->bias_term_) { - const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -template -void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = NULL; - Dtype* weight_diff = NULL; - if (this->param_propagate_down_[0]) { - weight = this->blobs_[0]->gpu_data(); - weight_diff = this->blobs_[0]->mutable_gpu_diff(); - } - Dtype* bias_diff = NULL; - if (this->bias_term_ && this->param_propagate_down_[1]) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - } - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], - cudnn::dataType::one, - top_descs_[i], top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, bias_diff + bias_offset_ * g)); - } - - // Gradient w.r.t. weights. - if (this->param_propagate_down_[0]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::one, - filter_desc_, weight_diff + weight_offset_ * g)); - } - - // Gradient w.r.t. bottom data. 
- if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], - cudnn::dataType::one, - filter_desc_, weight + weight_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::zero, - bottom_descs_[i], bottom_diff + bottom_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu deleted file mode 100644 index a952b855..00000000 --- a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - 
CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu deleted file mode 100644 index 21d14857..00000000 --- a/src/caffe/layers/cufiles/cudnn_relu_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Forward_gpu(bottom, top); - } - - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - // Fallback to standard Caffe for leaky ReLU. 
- if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Backward_gpu(top, propagate_down, bottom); - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu deleted file mode 100644 index 7a06cf72..00000000 --- a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu +++ /dev/null @@ -1,47 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNSigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, 
bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu deleted file mode 100644 index a9e2fcef..00000000 --- a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu deleted file mode 100644 index d287f6fe..00000000 --- a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" 
-#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNTanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu deleted file mode 100644 index 39bc4de8..00000000 --- a/src/caffe/layers/cufiles/deconv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - 
top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu deleted file mode 100644 index f9ea04f4..00000000 --- a/src/caffe/layers/cufiles/dropout_layer.cu +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - - -template -__global__ void DropoutForward(const int n, const Dtype* in, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] * (mask[index] > threshold) * scale; - } -} - -template -void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); - caffe_gpu_rng_uniform(count, mask); - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutForward<<>>( - count, bottom_data, mask, uint_thres_, scale_, top_data); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(count, bottom_data, top_data); - } -} - -template -__global__ void DropoutBackward(const int n, const Dtype* in_diff, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); - } -} - -template -void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const 
Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutBackward<<>>( - count, top_diff, mask, uint_thres_, scale_, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu deleted file mode 100644 index 2247870d..00000000 --- a/src/caffe/layers/cufiles/eltwise_layer.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } -} - -template -void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - 
break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward <<>>( - count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward<<>>( - count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } -} - -template -__global__ void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, 
bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu deleted file mode 100644 index 5b1de3ad..00000000 --- a/src/caffe/layers/cufiles/euclidean_loss_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu deleted file mode 100644 index 2d75d8dd..00000000 --- a/src/caffe/layers/cufiles/exp_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } -} - -template -void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu deleted file mode 100644 index cf929eee..00000000 --- a/src/caffe/layers/cufiles/filter_layer.cu +++ /dev/null @@ -1,70 +0,0 @@ 
-#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } -} - -template -void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + 
data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu deleted file mode 100644 index 5e3e4ced..00000000 --- a/src/caffe/layers/cufiles/hdf5_data_layer.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/data_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); - } - } -} - 
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu deleted file mode 100644 index ae497c34..00000000 --- a/src/caffe/layers/cufiles/hdf5_output_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu deleted file mode 100644 index 9c338b14..00000000 --- a/src/caffe/layers/cufiles/im2col_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template 
-void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu deleted file mode 100644 index d93560a0..00000000 --- a/src/caffe/layers/cufiles/inner_product_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., - bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); - if (bias_term_) { - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(),0, - this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); - } -} - -template -void 
InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm_ex(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemvv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, - (size_t)0, N_, reinterpret_cast(bias_multiplier_->gpu_data()), - (size_t)0, (Dtype)0., 1, - this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., - bottom[0]->mutable_gpu_diff(), 0); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu deleted file mode 100644 index 847c86cd..00000000 --- a/src/caffe/layers/cufiles/log_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/neuron_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); 
- } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } -} - -template -void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(LogLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu deleted file mode 100644 index 001b3c34..00000000 --- a/src/caffe/layers/cufiles/lrn_layer.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const 
Dtype* const in_off = in + offset; - Dtype* const scale_off = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } -} - - -template -void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -// TODO: check if it would be faster to just put it into the previous kernel. 
-template -__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out) { - CUDA_KERNEL_LOOP(index, nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -template -void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale<<>>( - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - CUDA_POST_KERNEL_CHECK; - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput<<>>( - n_threads, bottom_data, scale_data, -beta_, top_data); - CUDA_POST_KERNEL_CHECK; -} -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); - - -template -void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -template -__global__ void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int 
width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const bottom_off = bottom_data + offset; - const Dtype* const top_off = top_data + offset; - const Dtype* const scale_off = scale + offset; - const Dtype* const top_diff_off = top_diff + offset; - Dtype* const bottom_diff_off = bottom_diff + offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -template -void LRNLayer::CrossChannelBackward_gpu( - const 
vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff<<>>( - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); -} -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); - - - -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu deleted file mode 100644 index 3888a0c7..00000000 --- a/src/caffe/layers/cufiles/mvn_layer.cu +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } -} - -template -void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/pooling_layer.cu b/src/caffe/layers/cufiles/pooling_layer.cu deleted file mode 100644 index ca4b13f7..00000000 --- a/src/caffe/layers/cufiles/pooling_layer.cu +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxPoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data, int* mask, Dtype* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < 
wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} - -template -__global__ void AvePoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - Dtype aveval = 0; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } -} - -template -__global__ void StoPoolForwardTrain(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const rand_idx, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % 
pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - Dtype cumsum = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. - cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; - return; - } - } - } - } -} - - -template -__global__ void StoPoolForwardTest(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; - Dtype cumvalues = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = 
hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; - } -} - - -template -void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -template -__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - Dtype gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = top_diff + offset; - if (mask) { - const int* const mask_slice = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } else { - const Dtype* const top_mask_slice = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} - -template -__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - - -template -__global__ void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - } -} - - -template -void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward<<>>( - count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward<<>>( - count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, - 
top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu deleted file mode 100644 index 90d94405..00000000 --- a/src/caffe/layers/cufiles/power_layer.cu +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if 
(power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu deleted file mode 100644 index e1f20048..00000000 --- a/src/caffe/layers/cufiles/prelu_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// CUDA kernele for forward -template -__global__ void PReLUForward(const int n, const int channels, const int dim, - const Dtype* in, Dtype* out, const Dtype* slope_data, - const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? 
in[index] : in[index] * slope_data[c]; - } -} - -// CUDA kernel for bottom backward -template -__global__ void PReLUBackward(const int n, const int channels, const int dim, - const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff, - const Dtype* slope_data, const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } -} - -// CUDA kernel for element-wise parameter backward -template -__global__ void PReLUParamBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } -} - -template -void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? 
channels : 1; - - // For in-place computation - if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUForward<<>>( - count, channels, dim, bottom_data, top_data, slope_data, div_factor); - CUDA_POST_KERNEL_CHECK; -} - -template -void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. - if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<>>( - cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - if (channel_shared_) { - Dtype d; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = 
channel_shared_ ? channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward<<>>( - count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu deleted file mode 100644 index 2dbd3bc9..00000000 --- a/src/caffe/layers/cufiles/reduction_layer.cu +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ReductionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -} - -template -void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu deleted file mode 100644 index b8924c85..00000000 --- a/src/caffe/layers/cufiles/relu_layer.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out, - Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? 
in[index] : in[index] * negative_slope; - } -} - -template -void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUForward<<>>( - count, bottom_data, top_data, negative_slope); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void ReLUBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * negative_slope); - } -} - -template -void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUBackward<<>>( - count, top_diff, bottom_data, bottom_diff, negative_slope); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 547fa80c..00000000 --- a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" 
-#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu deleted file mode 100644 index e1af0657..00000000 --- a/src/caffe/layers/cufiles/sigmoid_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. 
+ exp(-in[index])); - } -} - -template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void SigmoidBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - const Dtype sigmoid_x = out_data[index]; - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); - } -} - -template -void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu deleted file mode 100644 index 8d044ee7..00000000 --- a/src/caffe/layers/cufiles/silence_layer.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. 
-} - -template -void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu deleted file mode 100644 index 796841d3..00000000 --- a/src/caffe/layers/cufiles/slice_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Slice(const int nthreads, const Dtype* in_data, - const bool forward, const int num_slices, const int slice_size, - const int bottom_slice_axis, const int top_slice_axis, - const int offset_slice_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_slice_size = slice_size * top_slice_axis; - const int slice_num = index / total_slice_size; - const int slice_index = index % total_slice_size; - const int bottom_index = slice_index + - (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; - if (forward) { - out_data[index] = in_data[bottom_index]; - } else { - out_data[bottom_index] = in_data[index]; - } - } -} - -template -void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->gpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = true; - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_gpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, 
bottom_data, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); - offset_slice_axis += top_slice_axis; - } -} - -template -void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = false; - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); - offset_slice_axis += top_slice_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu deleted file mode 100644 index 1f9c3a41..00000000 --- a/src/caffe/layers/cufiles/softmax_layer.cu +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template -__global__ void kernel_channel_subtract(const int count, - const int num, const int 
channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - out[index] = exp(data[index]); - } -} - -template -__global__ void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } -} - -template -__global__ void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, bottom_data, top_data); - 
// We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>( - count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); -} - -template -void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
- // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu deleted file mode 100644 index 7e0f3da4..00000000 --- a/src/caffe/layers/cufiles/softmax_loss_layer.cu +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - Dtype(FLT_MIN))); - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use 
it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } -} - -template -__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - const int channels = dim / spatial_dim; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - 
caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu deleted file mode 100644 index a4f5df26..00000000 --- a/src/caffe/layers/cufiles/split_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. 
- for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu deleted file mode 100644 index ccd6e63e..00000000 --- a/src/caffe/layers/cufiles/tanh_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// TanH neuron activation function layer. -// Adapted from ReLU layer code written by Yangqing Jia - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = tanh(in[index]); - } -} - -template -void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void TanHBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype tanhx = out_data[index]; - out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); - } -} - -template -void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer); - - -} // 
namespace caffe diff --git a/src/caffe/layers/cufiles/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu deleted file mode 100644 index bfa7f159..00000000 --- a/src/caffe/layers/cufiles/threshold_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ThresholdForward(const int n, const Dtype threshold, - const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > threshold ? 1 : 0; - } -} - -template -void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward<<>>( - count, threshold_, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer); - - -} // namespace caffe From b6b96a7471e7b9d1db132044c421bf1452e4d314 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 2 Sep 2015 06:12:20 +0800 Subject: [PATCH 046/124] Removed forward_opt and backward_opt functions in conv layer --- include/caffe/vision_layers.hpp | 8 -- src/caffe/layers/base_conv_layer.cpp | 137 --------------------------- src/caffe/layers/conv_layer.cpp | 13 --- 3 files changed, 158 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 2f2d7eef..3ee5a779 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -140,10 +140,6 @@ class BaseConvolutionLayer : public Layer { //opencl related data structures protected: - void forward_gpu_opt(const vector*>& bottom, const Dtype* weight, - const vector*>& top, bool skip_im2col = false) ; - void backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int opt_num2; int M_, N_, K_; int weight_offset_; @@ -223,12 +219,8 @@ 
class ConvolutionLayer : public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu_opt(const vector*>& bottom, - const vector*>& top); virtual void Forward_gpu_opt2(const vector*>& bottom, const vector*>& top); - virtual void Backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom); }; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 19458185..fc541ef9 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -448,143 +448,6 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, bias, (size_t)0, 1); } - -template -void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bottom, const Dtype* weight, const vector*>& top, bool skip_im2col){ - - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - int M_org = M_ * group_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - int weight_offset = M_ * K_; - int opt_num2 = global_packing_N; - cl_command_queue Queue; - cl_event prof_event; - for (int n = 0; n < num_; n += opt_num2) { - opt_num2 = opt_num2 > (num_ - n)? 
(num_ - n) : opt_num2; - top_offset = M_ * N_ * opt_num2; - col_offset = K_ * N_ * opt_num2; - im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - -#ifdef multiQ - for (int g = 0; g < group_; ++g) { - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, - (Dtype)0., (Dtype*)subTopMem, top_offset * g); - } - if(group_ == 2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } -#else - Queue = amdDevice.CommandQueue; - for (int g = 0; g < group_; ++g) { - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, - (Dtype)0., (Dtype*)subTopMem, top_offset * g); - } -#endif - transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); - - for (int z = 0; z < opt_num2; z++) - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z); - } - } -} -} - -template -void BaseConvolutionLayer::backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. 
- if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); - for (int n = 0; n < num_; ++n) { - caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - (Dtype)1., top_diff, top[i]->offset(n), N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, - bias_diff, (size_t)0, 1); - } - } - - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - int col_offset = K_ * N_; - int top_offset = M_ * N_; - int weight_offset = M_ * K_; - int opt_num2 = global_packing_N; - int g = 0; - cl_command_queue Queue; - cl_event prof_event; - - for (int n = 0; n < num_; n += opt_num2) { - opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - top_offset = M_ * (N_ * opt_num2); - col_offset = K_ * (N_ * opt_num2); - im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - - int height_top = M_ * group_, width_top = N_; - opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); - - for(g = 0; g < group_; ++g) { -#ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; -#else - Queue = amdDevice.CommandQueue; -#endif - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, - (Dtype)1., (Dtype*)subTopMem, top_offset * g, - (Dtype*)transMem, col_offset * g, (Dtype)1., - (Dtype*)weight_diff, weight_offset * g); - } - - if (propagate_down[i]) { - for (g = 0; g < group_; ++g) { -#ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; -#else - Queue = amdDevice.CommandQueue; -#endif - prof_event = caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_, - (Dtype)1., weight, 
weight_offset * g, - (Dtype*)subTopMem, top_offset * g, - (Dtype)0., (Dtype*)transMem, col_offset * g); - } - } - -#ifdef multiQ - if(group_ ==2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } -#endif - col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); - } - } - } -} - #endif // !CPU_ONLY INSTANTIATE_CLASS(BaseConvolutionLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 020098aa..c829dbd7 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -87,13 +87,6 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, Backward_gpu_org(top, propagate_down, bottom); } -template -void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - this->forward_gpu_opt(bottom, weight, top); -} - template void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, const vector*>& top) { @@ -153,12 +146,6 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom //CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template -void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - this->backward_gpu_opt(top, propagate_down, bottom); -} - template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { From 20142c4a4bf5f74c37a5253468f2b8de33b4e5d0 Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 4 Sep 2015 02:11:04 +0800 Subject: [PATCH 047/124] Enable SetDevice function; clean the code in device.cpp --- include/caffe/common.hpp | 1 - include/caffe/device.hpp | 10 +- src/caffe/common.cpp | 119 +--------------- src/caffe/device.cpp | 242 +++++++++----------------------- src/caffe/layers/conv_layer.cpp | 29 ++-- 5 files changed, 95 insertions(+), 306 deletions(-) diff --git 
a/include/caffe/common.hpp b/include/caffe/common.hpp index b93e0d6d..ac954a0e 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -233,7 +233,6 @@ class Caffe { // it personally but better to note it here in the header file. inline static void set_mode(Brew mode) { Get().mode_ = mode; - amdDevice.Init(); } // Sets the random seed of both boost and curand static void set_random_seed(const unsigned int seed); diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 31adcb5f..697e2391 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -34,7 +34,7 @@ namespace caffe { class Device{ public: - Device():numPlatforms(0),numDevices(0){ } + Device():numPlatforms(0),numDevices(0),device_id(INT_MIN){} ~Device(); cl_uint numPlatforms; cl_platform_id * platformIDs; @@ -42,22 +42,26 @@ class Device{ char openclVersion[64]; cl_uint numDevices; cl_device_id * DeviceIDs; + cl_context Context; cl_command_queue CommandQueue; cl_command_queue CommandQueue_helper; cl_program Program; cl_device_id * pDevices; + int device_id; + clblasOrder col; clblasOrder row; std::map Kernels; - - cl_int Init(); + + cl_int Init(int device_id = -1); cl_int ConvertToString(std::string pFileName,std::string &Str); void DisplayPlatformInfo(); void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); void GetDeviceInfo(); void DeviceQuery(); + int GetDevice(){return device_id;}; void BuildProgram(std::string kernel_dir); template diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 3891852a..83afe272 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -108,10 +108,12 @@ Caffe::Caffe() LOG(ERROR) << "Cannot create Curand generator. 
Curand won't be available."; } */ +#ifndef CPU_ONLY cl_int err = clblasSetup(); if(err != CL_SUCCESS){ LOG(ERROR) << "clBLAS setup failed "<(generator_->rng()); } -//const char* cublasGetErrorString(cublasStatus_t error) { - /* switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; -#if CUDA_VERSION >= 6000 - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; -#endif -#if CUDA_VERSION >= 6050 - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; -#endif - } -*/ -// return "Unknown cublas status"; -//} - -//const char* curandGetErrorString(curandStatus_t error) { - /*switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return 
"CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } -*/ - // return "Unknown curand status"; -//} - #endif // CPU_ONLY } // namespace caffe diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 7a866c11..dc47e907 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -37,25 +37,25 @@ namespace caffe { cl_mem test_alloc_mem[10]; extern long long unsigned device_mem_consumption; -Device amdDevice; char* buildOption = "-x clc++ "; //char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64"; -std::string oclKernelPath="./src/caffe/ocl/"; +std::string oclKernelPath = "./src/caffe/ocl/"; +Device amdDevice; Device::~Device(){ //clAmdBlasTeardown(); ReleaseKernels(); free((void*)platformIDs); - free(DeviceIDs); - clReleaseProgram(Program); - clReleaseCommandQueue(CommandQueue); - clReleaseCommandQueue(CommandQueue_helper); - clReleaseContext(Context); - LOG(INFO) << "device destructor"; + free(DeviceIDs); + clReleaseProgram(Program); + clReleaseCommandQueue(CommandQueue); + clReleaseCommandQueue(CommandQueue_helper); + clReleaseContext(Context); + LOG(INFO) << "device destructor"; } -cl_int Device::Init(){ +cl_int Device::Init(int deviceId){ //Get Platform Infomation DisplayPlatformInfo(); @@ -75,63 +75,36 @@ cl_int Device::Init(){ GetDeviceInfo(); cl_uint uiNumDevices; cl_bool unified_memory = false; -/* switch(Caffe::mode()) { - case Caffe::GPU: - //choose_gpu(); - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - uiNumDevices = numDevices; - if(0 == uiNumDevices){ + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if(0 == uiNumDevices){ LOG(FATAL) << "Err: No GPU devices"; - } - 
else{ + } else { pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices)); - for (int i = 0; i < (int)uiNumDevices; i++){ - clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); - if(unified_memory) //skip iGPU - continue; - else {//we pick the first GPU we found - pDevices[0] = pDevices[i]; + if (deviceId == -1) { + int i; + for (i = 0; i < (int)uiNumDevices; i++){ + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); + if(!unified_memory) { //skip iGPU + //we pick the first dGPU we found + pDevices[0] = pDevices[i]; + device_id = i; + LOG(INFO) << "Picked default device type : dGPU "<=0 && deviceId < uiNumDevices){ + pDevices[0] = pDevices[deviceId]; + device_id = deviceId; + LOG(INFO) << "Picked device type : GPU "< global_mem_size_limit){ - long long size_; - if((available_global_mem_size - global_mem_size_limit) >= global_mem_malloc_size_limit){ - size_ = global_mem_malloc_size_limit; - }else{ - size_ = available_global_mem_size - global_mem_size_limit; - } - available_global_mem_size = available_global_mem_size - size_; - int *tmpData = (int *)malloc(size_); - cl_int err; - int i = 0; - test_alloc_mem[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, size_, NULL, &err); - err = clEnqueueWriteBuffer(CommandQueue, test_alloc_mem[i], CL_TRUE, 0, size_, tmpData, 0, NULL, NULL); - i++; - device_mem_consumption += size_; - //printf("self alloc, device_mem_consumption = %lu\n", device_mem_consumption); - if(err != CL_SUCCESS) { - printf("Large Buffer Allocation failed! 
error_code = %d\n", err); - printf("self alloc, device_mem_consumption = %llu\n", device_mem_consumption); - exit(1); - } - - cl_ulong free_mem_size, mem_size; - cl_int err1 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,sizeof(cl_ulong),&free_mem_size,NULL); - cl_int err2 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&mem_size,NULL); - //std::cout<<"free memory size after allocation = "<::iterator it = Kernels.find(kernel_name); - if(it == Kernels.end()) + if (it == Kernels.end()) { cl_int _err=0; cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err); @@ -320,7 +217,7 @@ cl_kernel Device::GetKernel(std::string kernel_name) void Device::ReleaseKernels() { std::map::iterator it; - for(it = Kernels.begin(); it != Kernels.end(); it++) + for (it = Kernels.begin(); it != Kernels.end(); it++) { clReleaseKernel(it->second); } @@ -331,7 +228,7 @@ void Device::DisplayPlatformInfo(){ size_t size; err = clGetPlatformIDs (0, NULL, &numPlatforms); - if(err != CL_SUCCESS || numPlatforms <=0) + if (err != CL_SUCCESS || numPlatforms <=0) { LOG(ERROR) << "Failed to find any OpenCL platform."; return; @@ -349,11 +246,11 @@ void Device::DisplayPlatformInfo(){ //iterate through the list of platforms displaying platform information for (cl_uint i = 0; i < numPlatforms; i++ ){ - DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); - DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); - DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS"); + DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); + DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); 
+ DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS"); } } @@ -388,37 +285,37 @@ void Device::GetDeviceInfo(){ // we allow program run if no GPU is found. Just return. No error reported. if (numDevices < 1) { - LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; - LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; - return; + LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; + LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; + return; } DeviceIDs = (cl_device_id *) malloc (sizeof(cl_device_id) * numDevices); err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, DeviceIDs, NULL); - if(err != CL_SUCCESS) + if (err != CL_SUCCESS) { - LOG(INFO) << "Failed to find any GPU devices."; - return; + LOG(INFO) << "Failed to find any GPU devices."; + return; } LOG(INFO) << "Number of devices found:" << numDevices; - for(cl_uint i = 0; i < numDevices; i++){ - LOG(INFO) << "\t" << "DeviceID" << ":\t" <(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); - DisplayDeviceInfo(DeviceIDs[i], 
CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + for (cl_uint i = 0; i < numDevices; i++) { + LOG(INFO) << "\t" << "DeviceID" << ":\t" <(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); } @@ -435,7 +332,7 @@ void Device::DeviceQuery() size_t nameLen; cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); - if(res != CL_SUCCESS){ + if (res != CL_SUCCESS) { fprintf(stderr, "Err: Failed to Get Platform Info\n", res); return; } @@ -466,8 +363,7 
@@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string } - switch(name) -{ + switch(name){ case CL_DEVICE_TYPE: { std::string deviceType; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 8f7d8f82..4a85dd74 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -68,19 +68,19 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff"); - + // CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); +// CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff[0]"); + // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt(bottom, top); + Forward_gpu_opt(bottom, top); else - Forward_gpu_org(bottom, top); -// CHECK_BLOB_DATA(top[0],20, "top[0]"); + Forward_gpu_org(bottom, top); + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -90,6 +90,12 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, Backward_gpu_opt(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); +// CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); + // CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); +// CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); + // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); + + } template @@ -131,9 +137,6 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto } } - CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - CHECK_BLOB_DATA(top[0],20, "top[0]"); - } template @@ -159,8 +162,6 @@ void 
ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } } - // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -212,12 +213,8 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, } } } - - CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); - CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } + template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { From b804a1d9313c7c66c5b9955f561858a717308fea Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 14:50:21 +0800 Subject: [PATCH 048/124] Fixed conv layers opt2 bug --- include/caffe/common.hpp | 2 +- include/caffe/vision_layers.hpp | 8 ++- src/caffe/layers/base_conv_layer.cpp | 101 ++++++++------------------- src/caffe/layers/conv_layer.cpp | 43 ++++-------- 4 files changed, 49 insertions(+), 105 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index ac954a0e..c5bf909d 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -81,7 +81,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 237e9cbf..2f2d7eef 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -105,6 +105,7 @@ class BaseConvolutionLayer : public Layer { col2im_gpu(col_buff, 0, conv_in_channels_, 
conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); } + protected: inline void conv_im2col_gpu_opt(const Dtype* data) { im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); @@ -113,11 +114,12 @@ class BaseConvolutionLayer : public Layer { col2im_gpu_opt((Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); } + private: inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { - transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2); + transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2); } inline void conv_transpose_gpu(const Dtype* data){ - opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); } protected: inline void gpu_memset(Dtype* data, Dtype value, int count) { @@ -147,7 +149,7 @@ class BaseConvolutionLayer : public Layer { int weight_offset_; int col_offset_; int output_offset_; - int top_offset_, top_offset_n, bottom_offset_; + int top_offset_, top_offset_opt, bottom_offset_; public: static cl_mem subTopMem, transMem; static size_t subtop_mem_size, trans_mem_size; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 1c1379b3..faa7b63c 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -33,19 +33,9 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) template void BaseConvolutionLayer::ocl_setup() { -/* im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL); - col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL); - oclmem_kernel = clCreateKernel(amdDevice.Program, 
"oclmemfloat", NULL); - im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL); - col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL); - opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL); - ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL); - ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL); -*/ - M_ = conv_out_channels_ / group_; - K_ = kernel_dim_ / group_; - N_ = conv_out_spatial_dim_; - + M_ = num_output_ / group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; + N_ = height_out_ * width_out_; #ifdef use_packing_scheme size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); @@ -56,15 +46,6 @@ void BaseConvolutionLayer::ocl_setup() { template BaseConvolutionLayer::~BaseConvolutionLayer(){ - /* - OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) ); - OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) ); - OCL_CHECK( clReleaseKernel(oclmem_kernel) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) ); - OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) ); - OCL_CHECK( clReleaseKernel(im2col_opt_kernel) ); - OCL_CHECK( clReleaseKernel(col2im_opt_kernel) ); -*/ } @@ -314,9 +295,10 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - conv_im2col_gpu_opt(input); + //conv_im2col_gpu_opt(input); + im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, + (Dtype*)transMem, 0, opt_num2); } - //col_buff = col_buffer_.gpu_data(); } #ifdef multiQ for (int g = 0; g < group_; ++g) { @@ -324,7 +306,7 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, else Queue = amdDevice.CommandQueue_helper; prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., 
weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } if(group_ == 2){ clFinish(amdDevice.CommandQueue); @@ -335,10 +317,11 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, for (int g = 0; g < group_; ++g) { prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_ * g); + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } #endif - conv_transform_gpu((Dtype*)subTopMem, output); + //conv_transform_gpu((Dtype*)subTopMem, output); + transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2); } @@ -358,7 +341,7 @@ void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype)1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., output, top_offset_n + num_output_ * N_ * z); + (Dtype)1., output, top_offset_ + num_output_ * N_ * z); } template @@ -371,7 +354,7 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, for (int g = 0; g < group_; ++g) { caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights, weight_offset_ * g, - output, top_offset_+output_offset_ * g, + output, top_offset_ + output_offset_ * g, (Dtype)0., col_buff, col_offset_ * g); } if (!is_1x1_) { @@ -382,7 +365,6 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, const Dtype* weights, Dtype* input) { - //Dtype* col_buff = col_buffer_.mutable_gpu_data(); cl_command_queue Queue; if (is_1x1_) { int count = height_ * width_ * conv_in_channels_ * opt_num2; @@ -395,9 +377,9 @@ void 
BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #else Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_, + caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype)1., weights, weight_offset_ * g, - (Dtype*)subTopMem, top_offset_ * g, + (Dtype*)subTopMem, top_offset_opt * g, (Dtype)0., (Dtype*)transMem, col_offset_ * g); } #ifdef multiQ @@ -408,8 +390,10 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #endif if (!is_1x1_) { - conv_col2im_gpu_opt(input); - } + //conv_col2im_gpu_opt(input); + col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, + stride_w_, input, bottom_offset_, opt_num2); + } } template @@ -433,10 +417,14 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* col_buff = input; cl_command_queue Queue; if (!is_1x1_) { - conv_im2col_gpu_opt(input); - //col_buff = col_buffer_.gpu_data(); + //conv_im2col_gpu_opt(input); + im2col_gpu_opt(input, bottom_offset_, channels_, height_, + width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } - conv_transpose_gpu(output); + //conv_transpose_gpu(output); + int height_top = M_ * group_, width_top = N_; + opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); + for (int g = 0; g < group_; ++g) { #ifdef multiQ @@ -445,8 +433,8 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, #else Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, - (Dtype)1., (Dtype*)subTopMem, top_offset_ * g, + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g, (Dtype*)transMem, col_offset_ * g, (Dtype)1., (Dtype*)weights, 
weight_offset_ * g); #ifdef multiQ @@ -461,10 +449,8 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - /* caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.gpu_data(), 1., bias);*/ - caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_*width_out_, - (Dtype)1., input, top_offset_, height_out_*width_out_, + caffe_gpu_gemv(CblasNoTrans, num_output_, N_, + (Dtype)1., input, top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias, (size_t)0, 1); } @@ -475,12 +461,9 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); Dtype* top_data = top[i]->mutable_gpu_data(); Dtype* col_data = col_buffer_.mutable_gpu_data(); - /*in the packing schme, M, K stay the same. N multiplies by opt_num becomes much bigger N'. - N' is the M in sgemm call.*/ int M_org = M_ * group_; int col_offset = K_ * N_; int top_offset = M_ * N_; @@ -488,19 +471,13 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo int opt_num2 = global_packing_N; cl_command_queue Queue; cl_event prof_event; - //LOG(INFO) << "conv_fp optimized scheme"; for (int n = 0; n < num_; n += opt_num2) { opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - /*col_offset is the offset for sgemm, including packing and groups - for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ top_offset = M_ * N_ * opt_num2; col_offset = K_ * N_ * opt_num2; - //step1: packed im2col, col_size = (K_ * group_ ) * N_ - //this should be opt_num2 images packing together. 
im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - //step 2: sgemm: Top (subTopMem) = weight * col_data #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; @@ -521,10 +498,7 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo (Dtype)0., (Dtype*)subTopMem, top_offset * g); } #endif - //step 3: tranform transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); - //step 4: add bias - /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/ for (int z = 0; z < opt_num2; z++) if (bias_term_) { @@ -551,7 +525,7 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < num_; ++n) { - caffe_gpu_gemv(CblasNoTrans, M_, N_, + caffe_gpu_gemv(CblasNoTrans, num_output_, N_, (Dtype)1., top_diff, top[i]->offset(n), N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, bias_diff, (size_t)0, 1); @@ -570,25 +544,17 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t int g = 0; cl_command_queue Queue; cl_event prof_event; - //LOG(INFO) << "conv_bp optimized scheme"; for (int n = 0; n < num_; n += opt_num2) { opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - /*col_offset is the offset for sgemm, including packing and groups - for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/ top_offset = M_ * (N_ * opt_num2); col_offset = K_ * (N_ * opt_num2); - //step1: packed im2col, col_size = (K_ * group_ ) * N_ - //this should be opt_num2 images packing together. 
im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize int height_top = M_ * group_, width_top = N_; - //if (opt_num2 >1) opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); - //step 3: sgemm: Top (subTopMem) = weight * col_data for(g = 0; g < group_; ++g) { #ifdef multiQ if(g == 0) Queue = amdDevice.CommandQueue; @@ -602,7 +568,6 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t (Dtype*)weight_diff, weight_offset * g); } - //step4: if (propagate_down[i]) { for (g = 0; g < group_; ++g) { #ifdef multiQ @@ -624,14 +589,8 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t clFinish(amdDevice.CommandQueue_helper); } #endif - - //step5: col2im col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); -#ifdef Track_layer - LOG(WARNING) << "conv bp done"; -#endif - } } } diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 4a85dd74..4f59260a 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -1,5 +1,4 @@ #include - #include "caffe/filler.hpp" #include "caffe/layer.hpp" #include "caffe/util/im2col.hpp" @@ -33,7 +32,7 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } } -// CHECK_BLOB_DATA(top[0],20, "top[0]"); + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template @@ -67,10 +66,6 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } - //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - // CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff"); -// CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff[0]"); - // CHECK_BLOB_DATA(bottom[0], 20, 
"bottom[0]"); } template @@ -79,8 +74,7 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, if (use_packing_scheme && global_packing_N >1) Forward_gpu_opt(bottom, top); else - Forward_gpu_org(bottom, top); - // CHECK_BLOB_DATA(top[0],20, "top[0]"); + Forward_gpu_org(bottom, top); } template @@ -103,11 +97,6 @@ void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); this->forward_gpu_opt(bottom, weight, top); - -#ifdef Track_layer - LOG(WARNING) << "conv fp done"; -#endif - } template @@ -120,14 +109,14 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto Dtype* top_data = top[i]->mutable_gpu_data(); this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; //intermediate variables to pass offset - this->top_offset_ = this->M_ * this->N_ * this->opt_num2; - this->top_offset_n = top[i]->offset(n); + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); this->col_offset_ = this->K_ * this->N_ * this->opt_num2; this->bottom_offset_ = bottom[i]->offset(n); - this->weight_offset_ = this->M_ * this->K_; this->forward_gpu_gemm_opt(bottom_data, weight, top_data); if (this->bias_term_) { @@ -136,7 +125,6 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto } } } - } template @@ -162,8 +150,6 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } } -} - template void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { @@ -181,30 +167,31 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, // Bias gradient, if necessary. 
if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - this->gpu_memset(bias_diff, 0., this->blobs_[1]->count()); + ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < this->num_; ++n) { - // this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); this->backward_gpu_bias(bias_diff, top_diff); } - } + } if (this->param_propagate_down_[0] || propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); this->weight_offset_ = this->M_ * this->K_; this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; ++n) { + for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; - this->top_offset_n = top[i]->offset(n); + this->top_offset_ = top[i]->offset(n); this->bottom_offset_ = bottom[i]->offset(n); this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_ = this->M_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); } + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { this->backward_gpu_gemm_opt(top_diff, weight, @@ -253,10 +240,6 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } } -// CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); -// CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); -// CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); } #ifdef CPU_ONLY From 79e246a971eacf335b1ba08fc3b71af3244c01cb Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 15:29:44 +0800 Subject: [PATCH 049/124] conv clean up --- src/caffe/layers/base_conv_layer.cpp | 23 +++++++++-------------- src/caffe/layers/conv_layer.cpp | 5 +---- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index faa7b63c..6071c49b 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -295,9 +295,9 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { - //conv_im2col_gpu_opt(input); - im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, - (Dtype*)transMem, 0, opt_num2); + conv_im2col_gpu_opt(input); + // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, + // (Dtype*)transMem, 0, opt_num2); } } #ifdef multiQ @@ -390,9 +390,9 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, #endif if (!is_1x1_) { - //conv_col2im_gpu_opt(input); - col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - stride_w_, input, bottom_offset_, opt_num2); + conv_col2im_gpu_opt(input); + // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, + // stride_w_, input, bottom_offset_, opt_num2); } } @@ -414,12 +414,11 @@ void 
BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; cl_command_queue Queue; if (!is_1x1_) { - //conv_im2col_gpu_opt(input); - im2col_gpu_opt(input, bottom_offset_, channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); + conv_im2col_gpu_opt(input); + //im2col_gpu_opt(input, bottom_offset_, channels_, height_, + // width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } //conv_transpose_gpu(output); int height_top = M_ * group_, width_top = N_; @@ -462,8 +461,6 @@ void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bo for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); - - Dtype* col_data = col_buffer_.mutable_gpu_data(); int M_org = M_ * group_; int col_offset = K_ * N_; int top_offset = M_ * N_; @@ -535,8 +532,6 @@ void BaseConvolutionLayer::backward_gpu_opt(const vector*>& t if (this->param_propagate_down_[0] || propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); int col_offset = K_ * N_; int top_offset = M_ * N_; int weight_offset = M_ * K_; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 4f59260a..0c3a1367 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -81,7 +81,7 @@ template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N >1) - Backward_gpu_opt(top, propagate_down, bottom); + Backward_gpu_opt2(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); // CHECK_GLOBAL_MEM_DATA(weight_diff, 
this->blobs_[0]->count(), 20, "weight_diff"); @@ -189,9 +189,6 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); } - this->bottom_offset_ = bottom[i]->offset(n); - this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); // gradient w.r.t. bottom data, if necessary. if (propagate_down[i]) { this->backward_gpu_gemm_opt(top_diff, weight, From 1958793cdb0513337a25b24e273bddf5acad33c2 Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 1 Sep 2015 22:52:23 +0800 Subject: [PATCH 050/124] removed all cuDNN files --- src/caffe/layers/base_conv_layer.cpp | 6 - src/caffe/layers/cudnn_conv_layer.cpp | 130 ------ src/caffe/layers/cudnn_pooling_layer.cpp | 50 --- src/caffe/layers/cudnn_relu_layer.cpp | 46 --- src/caffe/layers/cudnn_sigmoid_layer.cpp | 46 --- src/caffe/layers/cudnn_softmax_layer.cpp | 50 --- src/caffe/layers/cudnn_tanh_layer.cpp | 46 --- src/caffe/layers/cufiles/absval_layer.cu | 33 -- src/caffe/layers/cufiles/base_data_layer.cu | 30 -- src/caffe/layers/cufiles/bnll_layer.cu | 60 --- src/caffe/layers/cufiles/concat_layer.cu | 71 ---- .../layers/cufiles/contrastive_loss_layer.cu | 111 ----- src/caffe/layers/cufiles/conv_layer.cu | 64 --- src/caffe/layers/cufiles/cudnn_conv_layer.cu | 160 -------- .../layers/cufiles/cudnn_pooling_layer.cu | 45 -- src/caffe/layers/cufiles/cudnn_relu_layer.cu | 57 --- .../layers/cufiles/cudnn_sigmoid_layer.cu | 47 --- .../layers/cufiles/cudnn_softmax_layer.cu | 48 --- src/caffe/layers/cufiles/cudnn_tanh_layer.cu | 48 --- src/caffe/layers/cufiles/deconv_layer.cu | 64 --- src/caffe/layers/cufiles/dropout_layer.cu | 77 ---- src/caffe/layers/cufiles/eltwise_layer.cu | 135 ------ .../layers/cufiles/euclidean_loss_layer.cu | 44 -- src/caffe/layers/cufiles/exp_layer.cu | 44 -- src/caffe/layers/cufiles/filter_layer.cu | 70 ---- src/caffe/layers/cufiles/hdf5_data_layer.cu | 53 --- 
src/caffe/layers/cufiles/hdf5_output_layer.cu | 43 -- src/caffe/layers/cufiles/im2col_layer.cu | 37 -- .../layers/cufiles/inner_product_layer.cu | 57 --- src/caffe/layers/cufiles/log_layer.cu | 57 --- src/caffe/layers/cufiles/lrn_layer.cu | 203 --------- src/caffe/layers/cufiles/mvn_layer.cu | 124 ------ src/caffe/layers/cufiles/pooling_layer.cu | 387 ------------------ src/caffe/layers/cufiles/power_layer.cu | 87 ---- src/caffe/layers/cufiles/prelu_layer.cu | 128 ------ src/caffe/layers/cufiles/reduction_layer.cu | 93 ----- src/caffe/layers/cufiles/relu_layer.cu | 65 --- .../sigmoid_cross_entropy_loss_layer.cu | 37 -- src/caffe/layers/cufiles/sigmoid_layer.cu | 62 --- src/caffe/layers/cufiles/silence_layer.cu | 28 -- src/caffe/layers/cufiles/slice_layer.cu | 71 ---- src/caffe/layers/cufiles/softmax_layer.cu | 149 ------- .../layers/cufiles/softmax_loss_layer.cu | 125 ------ src/caffe/layers/cufiles/split_layer.cu | 38 -- src/caffe/layers/cufiles/tanh_layer.cu | 59 --- src/caffe/layers/cufiles/threshold_layer.cu | 33 -- 46 files changed, 3518 deletions(-) delete mode 100644 src/caffe/layers/cudnn_conv_layer.cpp delete mode 100644 src/caffe/layers/cudnn_pooling_layer.cpp delete mode 100644 src/caffe/layers/cudnn_relu_layer.cpp delete mode 100644 src/caffe/layers/cudnn_sigmoid_layer.cpp delete mode 100644 src/caffe/layers/cudnn_softmax_layer.cpp delete mode 100644 src/caffe/layers/cudnn_tanh_layer.cpp delete mode 100644 src/caffe/layers/cufiles/absval_layer.cu delete mode 100644 src/caffe/layers/cufiles/base_data_layer.cu delete mode 100644 src/caffe/layers/cufiles/bnll_layer.cu delete mode 100644 src/caffe/layers/cufiles/concat_layer.cu delete mode 100644 src/caffe/layers/cufiles/contrastive_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/conv_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_conv_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_pooling_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_relu_layer.cu delete mode 
100644 src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_softmax_layer.cu delete mode 100644 src/caffe/layers/cufiles/cudnn_tanh_layer.cu delete mode 100644 src/caffe/layers/cufiles/deconv_layer.cu delete mode 100644 src/caffe/layers/cufiles/dropout_layer.cu delete mode 100644 src/caffe/layers/cufiles/eltwise_layer.cu delete mode 100644 src/caffe/layers/cufiles/euclidean_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/exp_layer.cu delete mode 100644 src/caffe/layers/cufiles/filter_layer.cu delete mode 100644 src/caffe/layers/cufiles/hdf5_data_layer.cu delete mode 100644 src/caffe/layers/cufiles/hdf5_output_layer.cu delete mode 100644 src/caffe/layers/cufiles/im2col_layer.cu delete mode 100644 src/caffe/layers/cufiles/inner_product_layer.cu delete mode 100644 src/caffe/layers/cufiles/log_layer.cu delete mode 100644 src/caffe/layers/cufiles/lrn_layer.cu delete mode 100644 src/caffe/layers/cufiles/mvn_layer.cu delete mode 100644 src/caffe/layers/cufiles/pooling_layer.cu delete mode 100644 src/caffe/layers/cufiles/power_layer.cu delete mode 100644 src/caffe/layers/cufiles/prelu_layer.cu delete mode 100644 src/caffe/layers/cufiles/reduction_layer.cu delete mode 100644 src/caffe/layers/cufiles/relu_layer.cu delete mode 100644 src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/sigmoid_layer.cu delete mode 100644 src/caffe/layers/cufiles/silence_layer.cu delete mode 100644 src/caffe/layers/cufiles/slice_layer.cu delete mode 100644 src/caffe/layers/cufiles/softmax_layer.cu delete mode 100644 src/caffe/layers/cufiles/softmax_loss_layer.cu delete mode 100644 src/caffe/layers/cufiles/split_layer.cu delete mode 100644 src/caffe/layers/cufiles/tanh_layer.cu delete mode 100644 src/caffe/layers/cufiles/threshold_layer.cu diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 6071c49b..19458185 100644 --- 
a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -296,8 +296,6 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, if (!is_1x1_) { if (!skip_im2col) { conv_im2col_gpu_opt(input); - // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_, - // (Dtype*)transMem, 0, opt_num2); } } #ifdef multiQ @@ -391,8 +389,6 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, if (!is_1x1_) { conv_col2im_gpu_opt(input); - // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - // stride_w_, input, bottom_offset_, opt_num2); } } @@ -417,8 +413,6 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, cl_command_queue Queue; if (!is_1x1_) { conv_im2col_gpu_opt(input); - //im2col_gpu_opt(input, bottom_offset_, channels_, height_, - // width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); } //conv_transpose_gpu(output); int height_top = M_ * group_, width_top = N_; diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp deleted file mode 100644 index 104d2b9d..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// Set to three for the benefit of the backward pass, which -// can use separate streams for calculating the gradient w.r.t. -// bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 3 - -/** - * TODO(dox) explain cuDNN interface - */ -template -void CuDNNConvolutionLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize CUDA streams and cuDNN. 
- stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - workspaceSizeInBytes = 0; - workspace = NULL; - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - CUDA_CHECK(cudaStreamCreate(&stream_[g])); - CUDNN_CHECK(cudnnCreate(&handle_[g])); - CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); - } - - // Set the indexing parameters. - weight_offset_ = (this->num_output_ / this->group_) - * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; - bias_offset_ = (this->num_output_ / this->group_); - - // Create filter descriptor. - cudnn::createFilterDesc(&filter_desc_, - this->num_output_ / this->group_, this->channels_ / this->group_, - this->kernel_h_, this->kernel_w_); - - // Create tensor descriptor(s) for data and corresponding convolution(s). - for (int i = 0; i < bottom.size(); i++) { - cudnnTensorDescriptor_t bottom_desc; - cudnn::createTensor4dDesc(&bottom_desc); - bottom_descs_.push_back(bottom_desc); - cudnnTensorDescriptor_t top_desc; - cudnn::createTensor4dDesc(&top_desc); - top_descs_.push_back(top_desc); - cudnnConvolutionDescriptor_t conv_desc; - cudnn::createConvolutionDesc(&conv_desc); - conv_descs_.push_back(conv_desc); - } - - // Tensor descriptor for bias. 
- if (this->bias_term_) { - cudnn::createTensor4dDesc(&bias_desc_); - } - - handles_setup_ = true; -} - -template -void CuDNNConvolutionLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::Reshape(bottom, top); - bottom_offset_ = (this->channels_ / this->group_) - * this->height_ * this->width_; - top_offset_ = (this->num_output_ / this->group_) - * this->height_out_ * this->width_out_; - - for (int i = 0; i < bottom.size(); i++) { - cudnn::setTensor4dDesc(&bottom_descs_[i], - this->num_, - this->channels_ / this->group_, - this->height_, this->width_, - this->channels_ * this->height_ * this->width_, - this->height_ * this->width_, - this->width_, 1); - cudnn::setTensor4dDesc(&top_descs_[i], - this->num_, - this->num_output_ / this->group_, - this->height_out_, this->width_out_, - this->num_output_ * this->height_out_ * this->width_out_, - this->height_out_ * this->width_out_, - this->width_out_, 1); - cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::setTensor4dDesc(&bias_desc_, - 1, this->num_output_ / this->group_, 1, 1); - } -} - -template -CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - for (int i = 0; i < bottom_descs_.size(); i++) { - cudnnDestroyTensorDescriptor(bottom_descs_[i]); - cudnnDestroyTensorDescriptor(top_descs_[i]); - cudnnDestroyConvolutionDescriptor(conv_descs_[i]); - } - if (this->bias_term_) { - cudnnDestroyTensorDescriptor(bias_desc_); - } - cudnnDestroyFilterDescriptor(filter_desc_); - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - cudaStreamDestroy(stream_[g]); - cudnnDestroy(handle_[g]); - } - - delete [] stream_; - delete [] handle_; -} - -INSTANTIATE_CLASS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp deleted file mode 100644 index c92c4e47..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::LayerSetUp(bottom, top); - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createPoolingDesc(&pooling_desc_, - this->layer_param_.pooling_param().pool(), &mode_, - this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - handles_setup_ = true; -} - -template -void CuDNNPoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->pooled_height_, this->pooled_width_); -} - -template -CuDNNPoolingLayer::~CuDNNPoolingLayer() { - // Check that handles have 
been setup before destroying. - if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroyPoolingDescriptor(pooling_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp deleted file mode 100644 index 759d8398..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ReLULayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - ReLULayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNReLULayer::~CuDNNReLULayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp deleted file mode 100644 index 32637873..00000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSigmoidLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp deleted file mode 100644 index 77a3225a..00000000 --- a/src/caffe/layers/cudnn_softmax_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::LayerSetUp(bottom, top); - // Initialize CUDNN. - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::Reshape(bottom, top); - int N = this->outer_num_; - int K = bottom[0]->shape(this->softmax_axis_); - int H = this->inner_num_; - int W = 1; - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSoftmaxLayer::~CuDNNSoftmaxLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp deleted file mode 100644 index 376faad3..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - TanHLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNTanHLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - TanHLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNTanHLayer::~CuDNNTanHLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu deleted file mode 100644 index bb310e1a..00000000 --- a/src/caffe/layers/cufiles/absval_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void AbsValLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); -} - -template -void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu deleted file mode 100644 index 9335a5bc..00000000 --- a/src/caffe/layers/cufiles/base_data_layer.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe/data_layers.hpp" - -namespace caffe { - -template -void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - // Reshape to loaded data. 
- top[0]->ReshapeLike(this->prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_gpu_data()); - } - // Start a new prefetch thread - CreatePrefetchThread(); -} - -INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu deleted file mode 100644 index d963d068..00000000 --- a/src/caffe/layers/cufiles/bnll_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -const float kBNLL_THRESHOLD = 50.; - -template -__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? - in[index] + log(1. + exp(-in[index])) : - log(1. 
+ exp(in[index])); - } -} - -template -void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BNLLBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } -} - -template -void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward<<>>( - count, top_diff, bottom_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu deleted file mode 100644 index 8f2e85d8..00000000 --- a/src/caffe/layers/cufiles/concat_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Concat(const int nthreads, const Dtype* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - 
const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } -} - -template -void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - offset_concat_axis += bottom_concat_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer); - 
-} // namespace caffe diff --git a/src/caffe/layers/cufiles/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu deleted file mode 100644 index 93123931..00000000 --- a/src/caffe/layers/cufiles/contrastive_loss_layer.cu +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), - Dtype(0.0)); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -__global__ void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - CUDA_KERNEL_LOOP(i, count) { 
- int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } -} - -template -void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward<<>>( - count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu deleted file mode 100644 index b8a98ff7..00000000 --- a/src/caffe/layers/cufiles/conv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void 
ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu deleted file mode 100644 index b4e802e1..00000000 --- a/src/caffe/layers/cufiles/cudnn_conv_layer.cu +++ /dev/null @@ -1,160 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -__global__ void sync_conv_groups() { } - -template -void CuDNNConvolutionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - - size_t workspace_limit_bytes = this->kernel_h_ * - this->kernel_w_ * - this->channels_ * - sizeof(int) + 1; - - // Forward through cuDNN in parallel over groups. 
- for (int g = 0; g < this->group_; g++) { - cudnnConvolutionFwdAlgo_t algo; - - // pick the convolution algorithm - // TODO(shelhamer) this should be done during reshape - // TODO(shelhamer) the choice of automatic or manual algorithm picking - // should be exposed in proto - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, // memoryLimitInBytes, - &algo)); - - // get minimum size of the workspace needed for the desired algorithm - size_t workspaceSizeInBytes_temp = 0; - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - algo, - &workspaceSizeInBytes_temp)); - - if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { - workspaceSizeInBytes = workspaceSizeInBytes_temp; - // free the existing workspace and allocate a new (larger) one - cudaFree(this->workspace); - cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); - if (err != cudaSuccess) { - // force zero memory path - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - workspace = NULL; - workspaceSizeInBytes = 0; - } - } - - // Filters. - CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + weight_offset_ * g, - conv_descs_[i], - algo, workspace, workspaceSizeInBytes, - cudnn::dataType::zero, - top_descs_[i], top_data + top_offset_ * g)); - - // Bias. 
- if (this->bias_term_) { - const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -template -void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = NULL; - Dtype* weight_diff = NULL; - if (this->param_propagate_down_[0]) { - weight = this->blobs_[0]->gpu_data(); - weight_diff = this->blobs_[0]->mutable_gpu_diff(); - } - Dtype* bias_diff = NULL; - if (this->bias_term_ && this->param_propagate_down_[1]) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - } - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], - cudnn::dataType::one, - top_descs_[i], top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, bias_diff + bias_offset_ * g)); - } - - // Gradient w.r.t. weights. - if (this->param_propagate_down_[0]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::one, - filter_desc_, weight_diff + weight_offset_ * g)); - } - - // Gradient w.r.t. bottom data. 
- if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], - cudnn::dataType::one, - filter_desc_, weight + weight_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::zero, - bottom_descs_[i], bottom_diff + bottom_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu deleted file mode 100644 index a952b855..00000000 --- a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - 
CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu deleted file mode 100644 index 21d14857..00000000 --- a/src/caffe/layers/cufiles/cudnn_relu_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Forward_gpu(bottom, top); - } - - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - // Fallback to standard Caffe for leaky ReLU. 
- if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Backward_gpu(top, propagate_down, bottom); - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu deleted file mode 100644 index 7a06cf72..00000000 --- a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu +++ /dev/null @@ -1,47 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNSigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_SIGMOID, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, 
bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu deleted file mode 100644 index a9e2fcef..00000000 --- a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu deleted file mode 100644 index d287f6fe..00000000 --- a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" 
-#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNTanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cufiles/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu deleted file mode 100644 index 39bc4de8..00000000 --- a/src/caffe/layers/cufiles/deconv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - 
top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu deleted file mode 100644 index f9ea04f4..00000000 --- a/src/caffe/layers/cufiles/dropout_layer.cu +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - - -template -__global__ void DropoutForward(const int n, const Dtype* in, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] * (mask[index] > threshold) * scale; - } -} - -template -void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); - caffe_gpu_rng_uniform(count, mask); - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutForward<<>>( - count, bottom_data, mask, uint_thres_, scale_, top_data); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(count, bottom_data, top_data); - } -} - -template -__global__ void DropoutBackward(const int n, const Dtype* in_diff, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); - } -} - -template -void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const 
Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutBackward<<>>( - count, top_diff, mask, uint_thres_, scale_, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu deleted file mode 100644 index 2247870d..00000000 --- a/src/caffe/layers/cufiles/eltwise_layer.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } -} - -template -void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - 
break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward <<>>( - count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward<<>>( - count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } -} - -template -__global__ void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, 
bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu deleted file mode 100644 index 5b1de3ad..00000000 --- a/src/caffe/layers/cufiles/euclidean_loss_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu deleted file mode 100644 index 2d75d8dd..00000000 --- a/src/caffe/layers/cufiles/exp_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } -} - -template -void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu deleted file mode 100644 index cf929eee..00000000 --- a/src/caffe/layers/cufiles/filter_layer.cu +++ /dev/null @@ -1,70 +0,0 @@ 
-#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } -} - -template -void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + 
data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu deleted file mode 100644 index 5e3e4ced..00000000 --- a/src/caffe/layers/cufiles/hdf5_data_layer.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/data_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); - } - } -} - 
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu deleted file mode 100644 index ae497c34..00000000 --- a/src/caffe/layers/cufiles/hdf5_output_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu deleted file mode 100644 index 9c338b14..00000000 --- a/src/caffe/layers/cufiles/im2col_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template 
-void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu deleted file mode 100644 index d93560a0..00000000 --- a/src/caffe/layers/cufiles/inner_product_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., - bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); - if (bias_term_) { - caffe_gpu_gemm_ex(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(),0, - this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); - } -} - -template -void 
InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm_ex(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemvv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, - (size_t)0, N_, reinterpret_cast(bias_multiplier_->gpu_data()), - (size_t)0, (Dtype)0., 1, - this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm_ex(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., - bottom[0]->mutable_gpu_diff(), 0); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu deleted file mode 100644 index 847c86cd..00000000 --- a/src/caffe/layers/cufiles/log_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/neuron_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); 
- } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } -} - -template -void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(LogLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu deleted file mode 100644 index 001b3c34..00000000 --- a/src/caffe/layers/cufiles/lrn_layer.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const 
Dtype* const in_off = in + offset; - Dtype* const scale_off = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } -} - - -template -void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -// TODO: check if it would be faster to just put it into the previous kernel. 
-template -__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out) { - CUDA_KERNEL_LOOP(index, nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -template -void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale<<>>( - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - CUDA_POST_KERNEL_CHECK; - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput<<>>( - n_threads, bottom_data, scale_data, -beta_, top_data); - CUDA_POST_KERNEL_CHECK; -} -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); - - -template -void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -template -__global__ void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int 
width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const bottom_off = bottom_data + offset; - const Dtype* const top_off = top_data + offset; - const Dtype* const scale_off = scale + offset; - const Dtype* const top_diff_off = top_diff + offset; - Dtype* const bottom_diff_off = bottom_diff + offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -template -void LRNLayer::CrossChannelBackward_gpu( - const 
vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff<<>>( - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); -} -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); - - - -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu deleted file mode 100644 index 3888a0c7..00000000 --- a/src/caffe/layers/cufiles/mvn_layer.cu +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } -} - -template -void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/pooling_layer.cu b/src/caffe/layers/cufiles/pooling_layer.cu deleted file mode 100644 index ca4b13f7..00000000 --- a/src/caffe/layers/cufiles/pooling_layer.cu +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxPoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data, int* mask, Dtype* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < 
wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} - -template -__global__ void AvePoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - Dtype aveval = 0; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } -} - -template -__global__ void StoPoolForwardTrain(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const rand_idx, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % 
pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - Dtype cumsum = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. - cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; - return; - } - } - } - } -} - - -template -__global__ void StoPoolForwardTest(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; - Dtype cumvalues = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = 
hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; - } -} - - -template -void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -template -__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - Dtype gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = top_diff + offset; - if (mask) { - const int* const mask_slice = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } else { - const Dtype* const top_mask_slice = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} - -template -__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - - -template -__global__ void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - } -} - - -template -void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward<<>>( - count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward<<>>( - count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, - 
top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu deleted file mode 100644 index 90d94405..00000000 --- a/src/caffe/layers/cufiles/power_layer.cu +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if 
(power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu deleted file mode 100644 index e1f20048..00000000 --- a/src/caffe/layers/cufiles/prelu_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// CUDA kernele for forward -template -__global__ void PReLUForward(const int n, const int channels, const int dim, - const Dtype* in, Dtype* out, const Dtype* slope_data, - const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? 
in[index] : in[index] * slope_data[c]; - } -} - -// CUDA kernel for bottom backward -template -__global__ void PReLUBackward(const int n, const int channels, const int dim, - const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff, - const Dtype* slope_data, const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } -} - -// CUDA kernel for element-wise parameter backward -template -__global__ void PReLUParamBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } -} - -template -void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? 
channels : 1; - - // For in-place computation - if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUForward<<>>( - count, channels, dim, bottom_data, top_data, slope_data, div_factor); - CUDA_POST_KERNEL_CHECK; -} - -template -void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. - if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<>>( - cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - if (channel_shared_) { - Dtype d; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = 
channel_shared_ ? channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward<<>>( - count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu deleted file mode 100644 index 2dbd3bc9..00000000 --- a/src/caffe/layers/cufiles/reduction_layer.cu +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ReductionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -} - -template -void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu deleted file mode 100644 index b8924c85..00000000 --- a/src/caffe/layers/cufiles/relu_layer.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out, - Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? 
in[index] : in[index] * negative_slope; - } -} - -template -void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUForward<<>>( - count, bottom_data, top_data, negative_slope); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void ReLUBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * negative_slope); - } -} - -template -void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUBackward<<>>( - count, top_diff, bottom_data, bottom_diff, negative_slope); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 547fa80c..00000000 --- a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" 
-#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu deleted file mode 100644 index e1af0657..00000000 --- a/src/caffe/layers/cufiles/sigmoid_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. 
+ exp(-in[index])); - } -} - -template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void SigmoidBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - const Dtype sigmoid_x = out_data[index]; - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); - } -} - -template -void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu deleted file mode 100644 index 8d044ee7..00000000 --- a/src/caffe/layers/cufiles/silence_layer.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. 
-} - -template -void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu deleted file mode 100644 index 796841d3..00000000 --- a/src/caffe/layers/cufiles/slice_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Slice(const int nthreads, const Dtype* in_data, - const bool forward, const int num_slices, const int slice_size, - const int bottom_slice_axis, const int top_slice_axis, - const int offset_slice_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_slice_size = slice_size * top_slice_axis; - const int slice_num = index / total_slice_size; - const int slice_index = index % total_slice_size; - const int bottom_index = slice_index + - (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; - if (forward) { - out_data[index] = in_data[bottom_index]; - } else { - out_data[bottom_index] = in_data[index]; - } - } -} - -template -void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->gpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = true; - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_gpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, 
bottom_data, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); - offset_slice_axis += top_slice_axis; - } -} - -template -void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = false; - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); - offset_slice_axis += top_slice_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu deleted file mode 100644 index 1f9c3a41..00000000 --- a/src/caffe/layers/cufiles/softmax_layer.cu +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template -__global__ void kernel_channel_subtract(const int count, - const int num, const int 
channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - out[index] = exp(data[index]); - } -} - -template -__global__ void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } -} - -template -__global__ void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, bottom_data, top_data); - 
// We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>( - count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); -} - -template -void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
- // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu deleted file mode 100644 index 7e0f3da4..00000000 --- a/src/caffe/layers/cufiles/softmax_loss_layer.cu +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - Dtype(FLT_MIN))); - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use 
it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } -} - -template -__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - const int channels = dim / spatial_dim; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - 
caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu deleted file mode 100644 index a4f5df26..00000000 --- a/src/caffe/layers/cufiles/split_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. 
- for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cufiles/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu deleted file mode 100644 index ccd6e63e..00000000 --- a/src/caffe/layers/cufiles/tanh_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// TanH neuron activation function layer. -// Adapted from ReLU layer code written by Yangqing Jia - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = tanh(in[index]); - } -} - -template -void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void TanHBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype tanhx = out_data[index]; - out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); - } -} - -template -void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer); - - -} // 
namespace caffe diff --git a/src/caffe/layers/cufiles/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu deleted file mode 100644 index bfa7f159..00000000 --- a/src/caffe/layers/cufiles/threshold_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ThresholdForward(const int n, const Dtype threshold, - const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > threshold ? 1 : 0; - } -} - -template -void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward<<>>( - count, threshold_, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer); - - -} // namespace caffe From 5c66e9b7eb4da81160ee25b94dcd3b6c89a5d1f8 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 2 Sep 2015 06:12:20 +0800 Subject: [PATCH 051/124] Removed forward_opt and backward_opt functions in conv layer --- include/caffe/vision_layers.hpp | 8 -- src/caffe/layers/base_conv_layer.cpp | 137 --------------------------- src/caffe/layers/conv_layer.cpp | 13 --- 3 files changed, 158 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 2f2d7eef..3ee5a779 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -140,10 +140,6 @@ class BaseConvolutionLayer : public Layer { //opencl related data structures protected: - void forward_gpu_opt(const vector*>& bottom, const Dtype* weight, - const vector*>& top, bool skip_im2col = false) ; - void backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int opt_num2; int M_, N_, K_; int weight_offset_; @@ -223,12 +219,8 @@ 
class ConvolutionLayer : public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu_opt(const vector*>& bottom, - const vector*>& top); virtual void Forward_gpu_opt2(const vector*>& bottom, const vector*>& top); - virtual void Backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom); }; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 19458185..fc541ef9 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -448,143 +448,6 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, bias, (size_t)0, 1); } - -template -void BaseConvolutionLayer::forward_gpu_opt(const vector*>& bottom, const Dtype* weight, const vector*>& top, bool skip_im2col){ - - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - int M_org = M_ * group_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - int weight_offset = M_ * K_; - int opt_num2 = global_packing_N; - cl_command_queue Queue; - cl_event prof_event; - for (int n = 0; n < num_; n += opt_num2) { - opt_num2 = opt_num2 > (num_ - n)? 
(num_ - n) : opt_num2; - top_offset = M_ * N_ * opt_num2; - col_offset = K_ * N_ * opt_num2; - im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - -#ifdef multiQ - for (int g = 0; g < group_; ++g) { - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, - (Dtype)0., (Dtype*)subTopMem, top_offset * g); - } - if(group_ == 2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } -#else - Queue = amdDevice.CommandQueue; - for (int g = 0; g < group_; ++g) { - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g, - (Dtype)0., (Dtype*)subTopMem, top_offset * g); - } -#endif - transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2); - - for (int z = 0; z < opt_num2; z++) - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z); - } - } -} -} - -template -void BaseConvolutionLayer::backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. 
- if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); - for (int n = 0; n < num_; ++n) { - caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - (Dtype)1., top_diff, top[i]->offset(n), N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, - bias_diff, (size_t)0, 1); - } - } - - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - int col_offset = K_ * N_; - int top_offset = M_ * N_; - int weight_offset = M_ * K_; - int opt_num2 = global_packing_N; - int g = 0; - cl_command_queue Queue; - cl_event prof_event; - - for (int n = 0; n < num_; n += opt_num2) { - opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2; - top_offset = M_ * (N_ * opt_num2); - col_offset = K_ * (N_ * opt_num2); - im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_, - width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2); - - int height_top = M_ * group_, width_top = N_; - opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); - - for(g = 0; g < group_; ++g) { -#ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; -#else - Queue = amdDevice.CommandQueue; -#endif - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, - (Dtype)1., (Dtype*)subTopMem, top_offset * g, - (Dtype*)transMem, col_offset * g, (Dtype)1., - (Dtype*)weight_diff, weight_offset * g); - } - - if (propagate_down[i]) { - for (g = 0; g < group_; ++g) { -#ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; -#else - Queue = amdDevice.CommandQueue; -#endif - prof_event = caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_, - (Dtype)1., weight, 
weight_offset * g, - (Dtype*)subTopMem, top_offset * g, - (Dtype)0., (Dtype*)transMem, col_offset * g); - } - } - -#ifdef multiQ - if(group_ ==2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } -#endif - col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_, - stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2); - } - } - } -} - #endif // !CPU_ONLY INSTANTIATE_CLASS(BaseConvolutionLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 0c3a1367..c5bdb02c 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -92,13 +92,6 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, } -template -void ConvolutionLayer::Forward_gpu_opt(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - this->forward_gpu_opt(bottom, weight, top); -} - template void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, const vector*>& top) { @@ -150,12 +143,6 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } } -template -void ConvolutionLayer::Backward_gpu_opt(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - this->backward_gpu_opt(top, propagate_down, bottom); -} - template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { From 7474975289f6146c1da7dcd18f184f6cad9638dd Mon Sep 17 00:00:00 2001 From: Junli Date: Fri, 4 Sep 2015 03:03:00 +0800 Subject: [PATCH 052/124] fixed merge conflicts --- src/caffe/layers/conv_layer.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index c5bdb02c..c829dbd7 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -66,13 +66,14 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } + } template void 
ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt(bottom, top); + Forward_gpu_opt2(bottom, top); else Forward_gpu_org(bottom, top); } @@ -84,12 +85,6 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, Backward_gpu_opt2(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); -// CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff"); - // CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff"); -// CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff"); - // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]"); - - } template @@ -118,6 +113,10 @@ void ConvolutionLayer::Forward_gpu_opt2(const vector*>& botto } } } + + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); + } template @@ -143,6 +142,10 @@ void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom } } + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); +} + template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { @@ -184,8 +187,8 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, } } } -} +} template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { From 8469f863792f53bfb0e479be85a87b6ae1d19b5f Mon Sep 17 00:00:00 2001 From: Junli Date: Fri, 4 Sep 2015 13:56:34 +0800 Subject: [PATCH 053/124] clean up warining info --- src/caffe/device.cpp | 19 +---- src/caffe/layers/base_conv_layer.cpp | 10 +-- src/caffe/solver.cpp | 7 ++ src/caffe/util/im2col.cpp | 22 ++---- src/caffe/util/math_functions.cpp | 2 + src/caffe/util/ocl_util.cpp | 13 ++- src/caffe/util/ocl_wrapper.cpp | 113 ++++++++++----------------- 7 files changed, 68 insertions(+), 118 deletions(-) diff --git 
a/src/caffe/device.cpp b/src/caffe/device.cpp index dc47e907..86e63e45 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -33,17 +33,12 @@ #include namespace caffe { -//delete it after test, Yibing -cl_mem test_alloc_mem[10]; -extern long long unsigned device_mem_consumption; - char* buildOption = "-x clc++ "; //char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64"; std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; Device::~Device(){ - //clAmdBlasTeardown(); ReleaseKernels(); free((void*)platformIDs); free(DeviceIDs); @@ -57,7 +52,6 @@ Device::~Device(){ cl_int Device::Init(int deviceId){ - //Get Platform Infomation DisplayPlatformInfo(); clGetPlatformIDs(0, NULL, &numPlatforms); @@ -67,7 +61,7 @@ cl_int Device::Init(int deviceId){ size_t nameLen; cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); if(res != CL_SUCCESS){ - fprintf(stderr, "Err: Failed to Get Platform Info\n", res); + fprintf(stderr, "Err: Failed to Get Platform Info\n"); return 0; } platformName[nameLen] = 0; @@ -106,20 +100,17 @@ cl_int Device::Init(int deviceId){ } } - //Create Context Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); if(NULL == Context){ fprintf(stderr,"Err: Failed to Create Context\n"); return 0; } - //Create CommandQueue CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL); CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL); if(NULL == CommandQueue || NULL == CommandQueue_helper){ fprintf(stderr,"Err: Failed to Create Commandqueue\n"); return 0; } - //BuildProgram from OpenCL kernel files BuildProgram(oclKernelPath); row = clblasRowMajor; col = clblasColumnMajor; @@ -128,7 +119,6 @@ cl_int Device::Init(int deviceId){ void Device::BuildProgram(std::string kernel_dir) { - //Access opencl kernel files std::string strSource = ""; DIR *ocl_dir; 
struct dirent *dirp; @@ -159,7 +149,6 @@ void Device::BuildProgram(std::string kernel_dir) if(NULL == Program){ fprintf(stderr,"Err: Failed to create program\n"); } - //Build Program cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL); LOG(INFO) << "Build Program"; if(CL_SUCCESS != iStatus){ @@ -169,8 +158,6 @@ void Device::BuildProgram(std::string kernel_dir) std::cout << szBuildLog; clReleaseProgram(Program); } - - // return Program; } //Use to read OpenCL source code @@ -225,7 +212,6 @@ void Device::ReleaseKernels() void Device::DisplayPlatformInfo(){ cl_int err; - size_t size; err = clGetPlatformIDs (0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <=0) @@ -323,7 +309,6 @@ void Device::GetDeviceInfo(){ void Device::DeviceQuery() { - //Get Platform Infomation DisplayPlatformInfo(); clGetPlatformIDs(0, NULL, &numPlatforms); @@ -333,7 +318,7 @@ void Device::DeviceQuery() size_t nameLen; cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); if (res != CL_SUCCESS) { - fprintf(stderr, "Err: Failed to Get Platform Info\n", res); + fprintf(stderr, "Err: Failed to Get Platform Info\n"); return; } platformName[nameLen] = 0; diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index fc541ef9..26787393 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -292,7 +292,6 @@ template void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { cl_command_queue Queue; - cl_event prof_event; if (!is_1x1_) { if (!skip_im2col) { conv_im2col_gpu_opt(input); @@ -302,7 +301,7 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; else Queue = amdDevice.CommandQueue_helper; - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + 
caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } @@ -313,12 +312,11 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, #else Queue = amdDevice.CommandQueue; for (int g = 0; g < group_; ++g) { - prof_event = caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } #endif - //conv_transform_gpu((Dtype*)subTopMem, output); transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2); } @@ -414,9 +412,7 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, if (!is_1x1_) { conv_im2col_gpu_opt(input); } - //conv_transpose_gpu(output); - int height_top = M_ * group_, width_top = N_; - opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2); + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); for (int g = 0; g < group_; ++g) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index f4b57a41..dde98baf 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -209,6 +209,13 @@ void Solver::Step(int iters) { blob->mutable_gpu_diff()); #else NO_GPU; +#endif + case Caffe::APU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); +#else + NO_GPU; #endif break; } diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 29c6c1f9..089023b7 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -63,7 +63,6 @@ void im2col_cpu(const Dtype* data_im, const int channels, } } -// Explicit instantiation template void im2col_cpu(const float* data_im, const int channels, const int height, const int width, const int 
kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, @@ -99,7 +98,6 @@ void col2im_cpu(const Dtype* data_col, const int channels, } } -// Explicit instantiation template void col2im_cpu(const float* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, @@ -137,7 +135,7 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int chann ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -149,7 +147,6 @@ template void col2im_gpu_opt(const double* data_col, const int col_offse const int height, const int width, const int ksize, const int pad, const int stride, double* data_im, const int img_offset, int optnum); -//cannot use now, need to modify kernel. 
template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -182,10 +179,9 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col); ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); - clFinish(amdDevice.CommandQueue); } @@ -198,7 +194,6 @@ template void im2col_gpu(const double* data_im, const int img_offset, co const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col, const int col_offset); -//cannot use now, need to modify kernel template void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, const int width, const int channels, @@ -232,7 +227,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im); ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -270,7 +265,7 @@ void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, co ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col); ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, 
NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); clFinish(amdDevice.CommandQueue); @@ -312,8 +307,8 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channe ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {num_kernels}; - size_t uiLocal_Work_Size[] = {256 - 256 % width_col}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; + size_t uiLocal_Work_Size[] = {(size_t)(256 - 256 % width_col)}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -334,9 +329,6 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; int num_kernels = channels * height * width; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. 
- // NOLINT_NEXT_LINE(whitespace/operatiors) cl_int ret; ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); @@ -354,7 +346,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index d48ec01a..63b449da 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -720,11 +720,13 @@ void popcll_kernel(const int n, const double* a, template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, const float* y) { + return 0; } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, const double* y) { + return 0; } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index e4fd42c6..7f9631e2 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -44,39 +44,36 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count){ err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); OCL_CHECK(err); - size_t Global_Work_Size[1] = {count}; + size_t Global_Work_Size[1] = {(size_t)count}; size_t Local_Work_Size[1] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -// Explicit instantiation template void ocl_memset(int* buffer, const int value, const int count); template void ocl_memset(float* buffer, const float value, const int count); template void ocl_memset(double* buffer, const double value, const int count); void ocl_memset(cl_kernel 
Kernel, cl_mem buffer, const int value, const int count){ - cl_int err=0; + cl_int err; err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value); err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); OCL_CHECK(err); - size_t Global_Work_Size[] = {count}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } void eventCallback(cl_event event, cl_int event_status, void* user_data){ - printf("The calling\n"); - int err = 0; cl_ulong ev_start_time = (cl_ulong)0; cl_ulong ev_end_time = (cl_ulong)0; double run_time; - err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL); - err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL); + OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL) ); + OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL) ); run_time = (double)(ev_end_time - ev_start_time); printf("The kernel's running time is %f s\n", run_time * 1.0e-9); } diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 9eab08ec..6294cce3 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -35,21 +35,6 @@ namespace caffe { typedef unsigned int uint32_t; struct array4x32 { uint32_t v[4]; }; -/* -template inline std::string get_dtype_suffix() -{ - dtype x; - const char type = typeid(x).name()[0]; - std::string suffix; - switch(type){ - case 'i': suffix = "_int"; break; - case 'd': suffix = "_double"; break; - case 'f': - default: suffix = "_float"; - } - return suffix; -} -*/ template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold) { @@ -87,19 
+72,14 @@ void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, c cl_int ret; ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src); - OCL_CHECK(ret); ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst); - OCL_CHECK(ret); ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset); - OCL_CHECK(ret); ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_); - OCL_CHECK(ret); ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_); - OCL_CHECK(ret); ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num); OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[]={M_ * packing_num}; + size_t uiGlobal_Work_Size2[]={(size_t)(M_ * packing_num)}; size_t uiLocal_Work_Size2[]={256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) ); } @@ -114,12 +94,11 @@ void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bo OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) ); OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) ); - size_t Global_Work_Size[1] = {num}; + size_t Global_Work_Size[1] = {(size_t)num}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data); template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data); @@ -130,12 +109,11 @@ void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){ OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); - size_t Global_Work_Size[1] = {num}; + size_t Global_Work_Size[1] = {(size_t)num}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation template void exp_gpu(cl_kernel Kernel, const int num, const float* data, float* out); template void exp_gpu(cl_kernel Kernel, const int num, const double* data, double* out); @@ -146,12 +124,11 @@ void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) ); OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); - size_t Global_Work_Size[1] = {num*dim}; + size_t Global_Work_Size[1] = {(size_t) (num * dim)}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const float* scale, float* data); template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data); @@ -175,7 +152,6 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p return loss; } -// Explicit instantiation template float softmax_gpu(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss); template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); @@ -192,7 +168,7 @@ void kernel_channel_max(const int num, const int channels, OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) ); - size_t Global_Work_Size[1] = {num*spatial_dim}; + size_t Global_Work_Size[1] = {(size_t) (num*spatial_dim)}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ 
-217,7 +193,7 @@ void kernel_channel_subtract( const int count, OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) ); OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); - size_t Global_Work_Size[1] = {count}; + size_t Global_Work_Size[1] = {(size_t)count}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ -239,7 +215,7 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out) OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); - size_t Global_Work_Size[1] = {count}; + size_t Global_Work_Size[1] = {(size_t)count}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ -260,7 +236,7 @@ void kernel_channel_sum(const int num, const int channels, OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); - size_t Global_Work_Size[1] = {num*channels}; + size_t Global_Work_Size[1] = {(size_t)(num*channels)}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ -282,7 +258,7 @@ void kernel_channel_div(const int count, const int num, const int channels, OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); - size_t Global_Work_Size[1] = {count}; + size_t Global_Work_Size[1] = {(size_t)count}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ -307,7 +283,7 @@ void kernel_channel_dot(const int num, const int 
channels, OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) ); OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) ); - size_t Global_Work_Size[1] = {num*spatial_dim}; + size_t Global_Work_Size[1] = {(size_t)(num*spatial_dim)}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } @@ -339,7 +315,7 @@ void SoftmaxLossForwardGPU(const int nthreads, OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); - size_t Global_Work_Size[1] = {nthreads}; + size_t Global_Work_Size[1] = {(size_t)nthreads}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -369,7 +345,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); - size_t Global_Work_Size[1] = {nthreads}; + size_t Global_Work_Size[1] = {(size_t)nthreads}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -385,12 +361,11 @@ void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){ OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) ); OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); - size_t Global_Work_Size[1] = {num}; + size_t Global_Work_Size[1] = {(size_t)num}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation template void scal_gpu(cl_kernel Kernel, const int num, const float alpha, float* 
data); template void scal_gpu(cl_kernel Kernel, const int num, const double alpha, double* data); @@ -401,12 +376,11 @@ void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) ); - size_t Global_Work_Size[1] = {num}; + size_t Global_Work_Size[1] = {(size_t)num}; size_t Local_Work_Size[1] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); } -// Explicit instantiation template void diff_gpu(cl_kernel Kernel, const int num, const int dim, float* data, const float* label); template void diff_gpu(cl_kernel Kernel, const int num, const int dim, double* data, const double* label); @@ -426,7 +400,7 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -459,7 +433,7 @@ void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -489,7 +463,7 @@ void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int cl ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t 
Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -517,7 +491,7 @@ void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int cln ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); @@ -547,7 +521,7 @@ void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {count * 1}; + size_t uiGlobal_Work_Size[] = {(size_t)count}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -571,7 +545,7 @@ void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {count * 1}; + size_t uiGlobal_Work_Size[] = {(size_t)count}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -597,7 +571,7 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {count}; + size_t uiGlobal_Work_Size[] = {(size_t)count}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -629,7 +603,7 @@ void MaxPoolBackward(const int nthreads, const 
Dtype* const top_diff, const int* ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -661,7 +635,7 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int ret |= clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -689,7 +663,7 @@ void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtyp ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {nthreads}; + size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } @@ -713,7 +687,7 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={count}; + size_t uiGlobal_Work_Size[]={(size_t)count}; size_t uiLocal_Work_Size[]={256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL)); } @@ -734,7 +708,7 @@ void PReLUForward(const int count, const int channels, const int dim, const Dtyp ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&top_data); ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data); ret |= clSetKernelArg(Kernel, 6, 
sizeof(cl_int), (void*)&div_factor); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -754,7 +728,7 @@ void PReLUBackward(const int count, const int channels, const int dim, const Dty ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff); ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data); ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -770,7 +744,7 @@ void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bot ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -788,7 +762,7 @@ void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dty ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count * 1}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -809,7 +783,7 @@ void ReLUBackward(const int count, const Dtype* 
top_diff, const Dtype* bottom_da ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {count}; + size_t uiGlobal_Work_Size[] = {(size_t)count}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -823,9 +797,6 @@ void opttrans(const Dtype* data_im, const int im_offset, const int channels, cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int num_kernels = channels * height * width * optnum; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - // NOLINT_NEXT_LINE(whitespace/operatiors) cl_int ret; ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); @@ -839,7 +810,7 @@ void opttrans(const Dtype* data_im, const int im_offset, const int channels, ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {num_kernels}; + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; size_t uiLocal_Work_Size[] = {256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); } @@ -866,7 +837,7 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k); ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={nthreads}; + size_t uiGlobal_Work_Size[]={(size_t)nthreads}; size_t uiLocal_Work_Size[]={256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) ); } @@ -889,7 +860,7 @@ void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta); 
ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[]={nthreads}; + size_t uiGlobal_Work_Size2[]={(size_t)nthreads}; size_t uiLocal_Work_Size2[]={256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); } @@ -920,7 +891,7 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio); ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={nthreads}; + size_t uiGlobal_Work_Size[]={(size_t)nthreads}; size_t uiLocal_Work_Size[]={256}; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) ); } @@ -945,7 +916,7 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; + size_t Global_Work_Size[] = {(size_t)n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -960,7 +931,7 @@ void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); OCL_CHECK(ret); - size_t Global_Work_Size[] = {N}; + size_t Global_Work_Size[] = {(size_t)N}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -978,7 +949,7 @@ void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); ret |= clSetKernelArg(Kernel, 3, 
sizeof(cl_mem), (void*)&y); OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; + size_t Global_Work_Size[] = {(size_t)n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -995,7 +966,7 @@ void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; + size_t Global_Work_Size[] = {(size_t)n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -1014,7 +985,7 @@ void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; + size_t Global_Work_Size[] = {(size_t)n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -1032,7 +1003,7 @@ void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); OCL_CHECK(ret); - size_t Global_Work_Size[] = {n}; + size_t Global_Work_Size[] = {(size_t)n}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -1054,7 +1025,7 @@ void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMe ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count}; + size_t Global_Work_Size[] = {(size_t)count}; 
size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -1077,7 +1048,7 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); - size_t Global_Work_Size[] = {count}; + size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } From 9cf71bbc7505cb794dadde0d5bd6e47399c421ec Mon Sep 17 00:00:00 2001 From: Junli Date: Fri, 4 Sep 2015 15:58:04 +0800 Subject: [PATCH 054/124] Remove the annotation code --- include/caffe/util/device_alternate.hpp | 59 ----------------------- include/caffe/util/math_functions.hpp | 3 -- include/caffe/util/ocl_wrapper.hpp | 2 - src/caffe/common.cpp | 40 +--------------- src/caffe/device.cpp | 1 - src/caffe/layers/base_data_layer.cpp | 15 ------ src/caffe/layers/dropout_layer.cpp | 13 ----- src/caffe/layers/pooling_layer.cpp | 13 ----- src/caffe/layers/relu_layer.cpp | 13 ----- src/caffe/layers/window_data_layer.cpp | 33 ------------- src/caffe/net.cpp | 1 - src/caffe/solver.cpp | 16 ------- src/caffe/util/benchmark.cpp | 14 ------ src/caffe/util/im2col.cu | 10 ---- src/caffe/util/math_functions.cpp | 2 - src/caffe/util/ocl_wrapper.cpp | 63 ------------------------- 16 files changed, 1 insertion(+), 297 deletions(-) diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 9184f4f9..bf5d7705 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -31,70 +31,11 @@ void classname::funcname##_##gpu(const vector*>& top, \ #else // Normal GPU + CPU Caffe. -//#include -//#include -//#include -//#include -//#include // cuda driver types #ifdef USE_CUDNN // cuDNN acceleration library. 
#include "caffe/util/cudnn.hpp" #endif -// -// CUDA macros -// - -// CUDA: various checks for different function calls. -/* -#define CUDA_CHECK(condition) \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ - } while (0) - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -// CUDA: check for error after kernel execution and exit loudly if there is one. -#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) -*/ namespace caffe { - -// CUDA: library error reporting. -//const char* cublasGetErrorString(cublasStatus_t error); -//const char* curandGetErrorString(curandStatus_t error); - -// CUDA: thread number configuration. -// Use 1024 threads per block, which requires cuda sm_2x or above, -// or fall back to attempt compatibility (best of luck to you). -#if __CUDA_ARCH__ >= 200 - const int CAFFE_CUDA_NUM_THREADS = 1024; -#else - const int CAFFE_CUDA_NUM_THREADS = 512; -#endif - -// CUDA: number of blocks for threads. 
-inline int CAFFE_GET_BLOCKS(const int N) { - return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; -} - } // namespace caffe #endif // CPU_ONLY diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 381dd8fd..46949ff3 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -160,9 +160,6 @@ void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -//template -//void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, Dtype* scratch_buf); -//CUDA version, need to be deleted template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 223e3278..db19e1b2 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -30,8 +30,6 @@ namespace caffe { typedef unsigned int uint32_t; -//template -//void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); template inline std::string get_dtype_suffix() { diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 83afe272..3e4e0dc0 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -12,7 +12,6 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { //To fix: for now we use fixed seed to get same result each time - /* int64_t s, seed, pid; FILE* f = fopen("/dev/urandom", "rb"); if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { @@ -28,8 +27,7 @@ int64_t cluster_seedgen(void) { pid = getpid(); s = time(NULL); seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; - */ + //return seed; LOG(WARNING) << "return fixed seed 37"; return 37; } @@ -93,21 +91,6 @@ void* Caffe::RNG::generator() { Caffe::Caffe() { -/* : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - 
mode_(Caffe::CPU) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) - != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) - != CURAND_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Curand generator. Curand won't be available."; - } -*/ #ifndef CPU_ONLY cl_int err = clblasSetup(); if(err != CL_SUCCESS){ @@ -117,33 +100,12 @@ Caffe::Caffe() } Caffe::~Caffe() { - /* if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -*/ #ifndef CPU_ONLY clblasTeardown(); #endif } void Caffe::set_random_seed(const unsigned int seed) { - // Curand seed - /* static bool g_curand_availability_logged = false; - if (Get().curand_generator_) { - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), - seed)); - CURAND_CHECK(curandSetGeneratorOffset(curand_generator(), 0)); - } else { - if (!g_curand_availability_logged) { - LOG(ERROR) << - "Curand not available. 
Skipping setting the curand seed."; - g_curand_availability_logged = true; - } - } - // RNG seed - Get().random_generator_.reset(new RNG(seed)); -*/ } void Caffe::SetDevice(const int device_id) { diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 86e63e45..df2de2e0 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -34,7 +34,6 @@ namespace caffe { char* buildOption = "-x clc++ "; -//char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64"; std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 71f5c132..5ba0f2e5 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -87,8 +87,6 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo JoinPrefetchThread(); DLOG(INFO) << "Thread joined"; - // Copy the data from prefetch thread to data_layer - //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) ); top[0]->ReshapeLike(this->prefetch_data_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); @@ -97,29 +95,16 @@ void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bo // Reshape to loaded labels. 
top[1]->ReshapeLike(prefetch_label_); OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); - //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) ); } -// clFinish(amdDevice.CommandQueue); - #ifdef Track_data_transfer #endif -// CHECK_BLOB_DATA(top[0], 20, "top[0]"); - // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); - //return Dtype(0.); } -/*template -void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ -}*/ - - - #ifdef CPU_ONLY STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index dfd6560d..9f630e8d 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -95,15 +95,9 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else -// caffe_gpu_rng_uniform(count, mask); caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #endif - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) -// DropoutForward<<>>( - // count, bottom_data, mask, uint_thres_, scale_, top_data); - // CUDA_POST_KERNEL_CHECK; } else { caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data); } @@ -118,14 +112,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); if (this->phase_ == TRAIN) { - 
//const unsigned int* mask = - // static_cast(rand_vec_.gpu_data()); const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - // DropoutBackward<<>>( - // count, top_diff, mask, uint_thres_, scale_, bottom_diff); - // CUDA_POST_KERNEL_CHECK; DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); } else { caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 83b18c89..ff86400b 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -333,25 +333,12 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, mask, top_mask); - /* - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask);*/ break; case PoolingParameter_PoolMethod_AVE: // NOLINT_NEXT_LINE(whitespace/operators) AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - /* - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);*/ break; case PoolingParameter_PoolMethod_STOCHASTIC: if (this->phase_ == TRAIN) { diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index c38814f1..784d2c91 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -43,15 +43,6 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = 
top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - // ReLUForward<<>>( - // count, bottom_data, top_data, negative_slope); - //CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; ReLUForward(count,bottom_data,top_data,negative_slope); } @@ -66,10 +57,6 @@ void ReLULayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int count = bottom[0]->count(); Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) -// ReLUBackward<<>>( - // count, top_diff, bottom_data, bottom_diff, negative_slope); - // CUDA_POST_KERNEL_CHECK; ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope); } } diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index c127d56b..cc7dc79d 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -418,39 +418,6 @@ void WindowDataLayer::InternalThreadEntry() { // get window label top_label[item_id] = window[WindowDataLayer::LABEL]; - #if 0 - // useful debugging code for dumping transformed windows to disk - string file_id; - std::stringstream ss; - ss << PrefetchRand(); - ss >> file_id; - std::ofstream inf((string("dump/") + file_id + - string("_info.txt")).c_str(), std::ofstream::out); - inf << image.first << std::endl - << window[WindowDataLayer::X1]+1 << std::endl - << window[WindowDataLayer::Y1]+1 << std::endl - << window[WindowDataLayer::X2]+1 << std::endl - << window[WindowDataLayer::Y2]+1 << std::endl - << do_mirror << std::endl - << top_label[item_id] << std::endl - << is_fg << std::endl; - inf.close(); - std::ofstream 
top_data_file((string("dump/") + file_id + - string("_data.txt")).c_str(), - std::ofstream::out | std::ofstream::binary); - for (int c = 0; c < channels; ++c) { - for (int h = 0; h < crop_size; ++h) { - for (int w = 0; w < crop_size; ++w) { - top_data_file.write(reinterpret_cast( - &top_data[((item_id * channels + c) * crop_size + h) - * crop_size + w]), - sizeof(Dtype)); - } - } - } - top_data_file.close(); - #endif - item_id++; } } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index f5d0e703..e070d774 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -35,7 +35,6 @@ Net::Net(const string& param_file, Phase phase) { template void Net::Init(const NetParameter& in_param) { // Set phase from the state. - //amdDevice.Init(); phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and // the current NetState. diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index dde98baf..6e1a40a7 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -27,15 +27,6 @@ void Solver::ocl_setup(){ powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } -//template -/*Solver::~Solver(){ - OCL_CHECK( clReleaseKernel(scalar_kernel) ); - OCL_CHECK( clReleaseKernel(add_kernel) ); - OCL_CHECK( clReleaseKernel(div_kernel) ); - OCL_CHECK( clReleaseKernel(powx_kernel) ); -} -*/ - template Solver::Solver(const string& param_file) : net_() { @@ -51,14 +42,7 @@ void Solver::Init(const SolverParameter& param) { param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; -//#ifndef CPU_ONLY - //AMD device related initialization - //amdDevice.Init(); ocl_setup(); -// cl_int err = clblasSetup(); -//#else -// NO_GPU; -//#endif if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 6942f8a3..0383fd27 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -13,14 +13,6 @@ 
Timer::Timer() } Timer::~Timer() { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // CUDA_CHECK(cudaEventDestroy(start_gpu_)); - // CUDA_CHECK(cudaEventDestroy(stop_gpu_)); -#else - NO_GPU; -#endif - } } void Timer::Start() { @@ -72,12 +64,6 @@ float Timer::Seconds() { void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // CUDA_CHECK(cudaEventCreate(&start_gpu_)); - // CUDA_CHECK(cudaEventCreate(&stop_gpu_)); -#else - NO_GPU; -#endif } initted_ = true; } diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index c90f93eb..d52acb54 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -88,16 +88,6 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, int w_col_end = min(w / stride_w + 1, width_col); int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; int h_col_end = min(h / stride_h + 1, height_col); - /* - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize - + (w - w_col * stride_w); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - */ // equivalent implementation int offset = (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 63b449da..57fc9fd4 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -540,8 +540,6 @@ double caffe_cpu_dot(const int n, const double* x, const double* y) { template <> void caffe_gpu_dot(const int n, const float* x, const float* y, float* out) { - //need to pass in scratchBuff - //AMDBLAS_CHECK(clAmdBlasSdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp 
b/src/caffe/util/ocl_wrapper.cpp index 6294cce3..ccaf60df 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1058,69 +1058,6 @@ template void DropoutBackward(const int count, const double* top_diff, c template void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) { -/* std::string kernel_name = "Conv" + get_dtype_suffix(); - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - - int weights_stride = kernel_w * kernel_h;//correct? - int bot_stride = width; - int bot_channel_stride = width * height; - int bot_batch_stride = width * height * channel_in; - - int top_stride = width_out; - int top_channel_stride = width_out * height_out; - int top_batch_stride = width_out * height_out * channel_out; - - //int height_out = (int)top->getDim(ANN_TENSOR_HEIGHT); - //int width_out = (int)top->getDim(ANN_TENSOR_WIDTH); - int vis_height = height_out * stride - 2 * pad; - int vis_width = width_out * stride - 2 * pad; - - int ocl_group_sz0_ = 8; - int ocl_group_sz1_ = 8; - int ocl_group_lg2sz1_ = (int)ceil(log((double)ocl_group_sz1_)/log(2.)); - int ocl_group_lg2sz0_ = (int)ceil(log((double)ocl_group_sz0_)/log(2.)); - - int outputs = channel_out; - int n_out_pix_horiz_ = (width_out < 2 * ocl_group_sz0_) ? 1 : (width_out < 4 * ocl_group_sz0_) ? 2 : 4; - int n_out_pix_vert_ = (height_out < 2 * ocl_group_sz1_) ? 1 : 2; // (height_out <= 192) ? 2 : 4; - int n_outs_ = ((outputs & 1) == 1) ? 1 : (kernel_w == 3) && ((outputs / 4) * 4 == outputs) ? 4 : 2; // (n_out_pix_horiz_ >= 4) ? 
1 : 2; - - int n_outputs = channel_out; - n_outputs /= n_outs_; - int i_n_group_horiz = (width_out + ocl_group_sz0_ * n_out_pix_horiz_ - 1) / (ocl_group_sz0_ * n_out_pix_horiz_); - int i_n_group_vert = (height_out + ocl_group_sz1_ * n_out_pix_vert_ - 1) / (ocl_group_sz1_ * n_out_pix_vert_); - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&weights); - ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&bias); - ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(ker_rand, 1, sizeof(cl_int), (void*)&kernel_w); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&channel_out); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&channel_in); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&pad); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&n_out_pix_horiz_); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&n_out_pix_vert_); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_batch_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_channel_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&bot_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_batch_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_channel_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&top_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&vis_width); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&vis_height); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&weights_stride); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&width_out); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int), (void*)&height_out); - ret |= clSetKernelArg(ker_rand, 2, 
sizeof(cl_int), (void*)&n_outs_); - OCL_CHECK(ret); - - size_t l_wk[3] = { ocl_group_sz0_, ocl_group_sz1_, 1}; - size_t g_wk[3] = { i_n_group_horiz * l_wk[0], i_n_group_vert * l_wk[1], batch_sz * n_outputs }; - - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );*/ } template void ocl_conv(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); From dce5407594dbf06b7b99b1dfa5a00eed9ea99352 Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 4 Sep 2015 23:00:30 +0800 Subject: [PATCH 055/124] Partially get through unit test --- include/caffe/common.hpp | 2 +- include/caffe/test/test_caffe_main.hpp | 2 +- include/caffe/util/ocl_wrapper.hpp | 7 ++-- src/caffe/layers/dropout_layer.cpp | 4 +-- src/caffe/layers/exp_layer.cpp | 21 +++++++++++ src/caffe/layers/split_layer.cpp | 2 +- src/caffe/ocl/pooling_layer.cl | 2 +- src/caffe/ocl/util.cl | 11 +++++- src/caffe/solver.cpp | 2 +- src/caffe/test/test_caffe_main.cpp | 13 +++---- src/caffe/test/test_common.cpp | 7 ++-- src/caffe/test/test_filter_layer.cpp | 4 +-- src/caffe/test/test_inner_product_layer.cpp | 14 ++++---- src/caffe/test/test_platform.cpp | 6 ++-- src/caffe/test/test_util_blas.cpp | 8 +++-- src/caffe/util/math_functions.cpp | 30 +++++++++++----- src/caffe/util/ocl_wrapper.cpp | 39 ++++++++++++++++----- 17 files changed, 123 insertions(+), 51 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index c5bf909d..8c738ca3 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -81,7 +81,7 @@ private:\ //the following 
are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 0 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091..b4f8f284 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -17,7 +17,7 @@ using std::endl; #ifdef CMAKE_BUILD #include "caffe_config.h" #else - #define CUDA_TEST_DEVICE -1 + #define OPENCL_TEST_DEVICE -1 #define CMAKE_SOURCE_DIR "src/" #define EXAMPLES_SOURCE_DIR "examples/" #define CMAKE_EXT "" diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 223e3278..6a019895 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -134,7 +134,10 @@ template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); template -void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ); +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ); template void kernel_channel_max(const int num, const int channels, @@ -174,7 +177,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, const int ignore_label_, Dtype* counts); template -void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y); +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y); template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index dfd6560d..3a060388 100644 --- 
a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -17,7 +17,7 @@ void DropoutLayer::ocl_setup(int bottom_count){ template DropoutLayer::~DropoutLayer(){ - OCL_CHECK( clReleaseMemObject(MaskMem) ); +// OCL_CHECK( clReleaseMemObject(MaskMem) ); } @@ -105,7 +105,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, // count, bottom_data, mask, uint_thres_, scale_, top_data); // CUDA_POST_KERNEL_CHECK; } else { - caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data); + caffe_gpu_copy(count, bottom_data, top_data); } } diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 547fca6a..5e7819c0 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -62,11 +62,32 @@ void ExpLayer::Backward_cpu(const vector*>& top, template void ExpLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); + } else { + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); + } } template void ExpLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (!propagate_down[0]) { return; } + const int count = bottom[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index af8a9123..4b60db10 100644 --- a/src/caffe/layers/split_layer.cpp +++ 
b/src/caffe/layers/split_layer.cpp @@ -66,7 +66,7 @@ void SplitLayer::Backward_gpu(const vector*>& top, caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); return; } - caffe_gpu_add(gpu_add_kernel, count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), bottom[0]->mutable_gpu_diff()); // Add remaining top blob diffs. for (int i = 2; i < top.size(); ++i) { diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index b6a5a0a1..10d3b9f5 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -245,7 +245,7 @@ template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AveP template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template -void StoPoolBackward(const int nthreads, +__kernel void StoPoolBackward(const int nthreads, __global Dtype* rand_idx, __global Dtype* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index 9710a343..cda05652 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -49,13 +49,22 @@ template __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ int gdx = get_global_id(0); if(gdx < N){ - Y[gdx] =((0.00.0)-(X[gdx]<0.0)); } } template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, 
__global double* Y); +template +__kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) { + int index = get_global_id(0); + if(index < n) { + y[index] = fabs(a[index]); + } +} +template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y); template __kernel void get_max(const int num, const int dim, __global T* data, __global T* out){ diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index dde98baf..643e696c 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -780,7 +780,7 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { this->update_[param_id]->mutable_gpu_data()); // update history - caffe_gpu_add(add_kernel, net_params[param_id]->count(), + caffe_gpu_add(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->history_[param_id]->mutable_gpu_data()); diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index 5f41d325..278d520c 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -7,12 +7,12 @@ namespace caffe { #ifndef CPU_ONLY - cudaDeviceProp CAFFE_TEST_CUDA_PROP; + //cudaDeviceProp CAFFE_TEST_CUDA_PROP; #endif } #ifndef CPU_ONLY -using caffe::CAFFE_TEST_CUDA_PROP; +//using caffe::CAFFE_TEST_CUDA_PROP; #endif @@ -23,15 +23,16 @@ int main(int argc, char** argv) { // Before starting testing, let's first print out a few cuda defice info. 
int device = 0; // cudaGetDeviceCount(&device); - cout << "Cuda number of devices: " << device << endl; + // cout << "Cuda number of devices: " << device << endl; if (argc > 1) { // Use the given device device = atoi(argv[1]); - cudaSetDevice(device); + // cudaSetDevice(device); + caffe::amdDevice.Init(device); cout << "Setting to use device " << device << endl; - } else if (CUDA_TEST_DEVICE >= 0) { + } else if (OPENCL_TEST_DEVICE >= 0) { // Use the device assigned in build configuration; but with a lower priority - device = CUDA_TEST_DEVICE; + device = OPENCL_TEST_DEVICE; } // cudaGetDevice(&device); cout << "Current device id: " << device << endl; diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp index b3a61b0f..6c80de1d 100644 --- a/src/caffe/test/test_common.cpp +++ b/src/caffe/test/test_common.cpp @@ -14,12 +14,13 @@ class CommonTest : public ::testing::Test {}; #ifndef CPU_ONLY // GPU Caffe singleton test. +/* TEST_F(CommonTest, TestCublasHandlerGPU) { int cuda_device_id; CUDA_CHECK(cudaGetDevice(&cuda_device_id)); EXPECT_TRUE(Caffe::cublas_handle()); } - +*/ #endif TEST_F(CommonTest, TestBrewMode) { @@ -45,7 +46,7 @@ TEST_F(CommonTest, TestRandSeedCPU) { } #ifndef CPU_ONLY // GPU Caffe singleton test. 
- +/* TEST_F(CommonTest, TestRandSeedGPU) { SyncedMemory data_a(10 * sizeof(unsigned int)); SyncedMemory data_b(10 * sizeof(unsigned int)); @@ -60,7 +61,7 @@ TEST_F(CommonTest, TestRandSeedGPU) { ((const unsigned int*)(data_b.cpu_data()))[i]); } } - +*/ #endif } // namespace caffe diff --git a/src/caffe/test/test_filter_layer.cpp b/src/caffe/test/test_filter_layer.cpp index c641b6ef..801881e9 100644 --- a/src/caffe/test/test_filter_layer.cpp +++ b/src/caffe/test/test_filter_layer.cpp @@ -13,7 +13,7 @@ #include "caffe/test/test_gradient_check_util.hpp" namespace caffe { - +/* template class FilterLayerTest : public MultiDeviceTest { typedef typename TypeParam::Dtype Dtype; @@ -124,5 +124,5 @@ TYPED_TEST(FilterLayerTest, TestGradient) { checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_, 0); } - +*/ } // namespace caffe diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index c03df173..f0c36b13 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -13,9 +13,9 @@ namespace caffe { -#ifndef CPU_ONLY -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif +//#ifndef CPU_ONLY +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//#endif template class InnerProductLayerTest : public MultiDeviceTest { @@ -59,10 +59,10 @@ TYPED_TEST(InnerProductLayerTest, TestForward) { typedef typename TypeParam::Dtype Dtype; bool IS_VALID_CUDA = false; #ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; + // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; #endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); @@ -89,10 +89,10 @@ TYPED_TEST(InnerProductLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; bool IS_VALID_CUDA = false; #ifndef CPU_ONLY - 
IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; + // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; #endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); diff --git a/src/caffe/test/test_platform.cpp b/src/caffe/test/test_platform.cpp index f3513e08..7a30c2db 100644 --- a/src/caffe/test/test_platform.cpp +++ b/src/caffe/test/test_platform.cpp @@ -10,10 +10,10 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; class PlatformTest : public ::testing::Test {}; - +/* TEST_F(PlatformTest, TestInitialization) { printf("Major revision number: %d\n", CAFFE_TEST_CUDA_PROP.major); printf("Minor revision number: %d\n", CAFFE_TEST_CUDA_PROP.minor); @@ -51,7 +51,7 @@ TEST_F(PlatformTest, TestInitialization) { (CAFFE_TEST_CUDA_PROP.unifiedAddressing ? "Yes" : "No")); EXPECT_TRUE(true); } - +*/ } // namespace caffe #endif // CPU_ONLY diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp index 8770f309..9cc9558c 100644 --- a/src/caffe/test/test_util_blas.cpp +++ b/src/caffe/test/test_util_blas.cpp @@ -12,7 +12,7 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; template class GemmTest : public ::testing::Test {}; @@ -30,7 +30,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(12, data, B.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { // [1, 2, 3; 4 5 6] * [1, 2, 3, 4; 5, 6, 7, 8; 9, 10, 11, 12]; caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); @@ -100,7 +101,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) { 
caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(3, data, x.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { caffe_cpu_gemv(CblasNoTrans, 2, 3, 1., A.cpu_data(), x.cpu_data(), 0., y.mutable_cpu_data()); for (int i = 0; i < 2; ++i) { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 63b449da..fef8aa34 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -227,11 +227,13 @@ void caffe_gpu_sgnbit(const int n, const double* x, double* y) template<> void caffe_gpu_abs(const int n, const float* x, float* y) { + caffe_gpu_abs_ocl(n, x, y); } template<> void caffe_gpu_abs(const int n, const double* x, double* y) { + caffe_gpu_abs_ocl(n, x, y); } template <> @@ -288,14 +290,16 @@ void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { - if(X != Y) + if(X != Y){ CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + } } template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { - if(X != Y) + if(X != Y){ CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + } } template <> @@ -622,11 +626,15 @@ void caffe_cpu_scale(const int n, const double alpha, const double *x, template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, float* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, double* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } template @@ -664,18 +672,24 @@ void mul_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { } +template <> +void caffe_gpu_exp(const int N, const float* a, float* y) { + 
kernel_exp(N, a, y); +} + +template <> +void caffe_gpu_exp(const int N, const double* a, double* y) { + kernel_exp(N, a, y); +} + template<> void caffe_gpu_sign(const int N, const float *X, float *Y){ - cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); - caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); - clReleaseKernel(caffe_gpu_sign_kernel); + caffe_gpu_sign_ocl(N, X, Y); } template<> void caffe_gpu_sign(const int N, const double *X, double *Y){ - cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL); - caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y); - clReleaseKernel(caffe_gpu_sign_kernel); + caffe_gpu_sign_ocl(N, X, Y); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 6294cce3..757a485b 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -50,9 +50,9 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dty cl_int ret; ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float), (void*)&inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float), (void*)&sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float), (void*)&threshold); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*)&threshold); ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&nrounds); ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*)&size); OCL_CHECK(ret); @@ -909,7 +909,9 @@ template void LRNComputeDiff(cl_kernel kernel, const int nthreads, const double cache_ratio, double* const bottom_diff); template -void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ +void caffe_gpu_add(const int 
n, const Dtype* in1, const Dtype* in2, Dtype* y){ + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1); @@ -921,11 +923,30 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_add (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y); -template void caffe_gpu_add (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y); +template void caffe_gpu_add (const int n, const float* in1, const float* in2, float* y); +template void caffe_gpu_add (const int n, const double* in1, const double* in2, double* y); template -void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ){ + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)N}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y ); +template void caffe_gpu_sign_ocl(const int N, const double* X, double* Y ); + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ){ + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); 
cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); @@ -936,8 +957,8 @@ void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y ){ OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void caffe_gpu_sign(cl_kernel Kernel,const int N, const float* X, float* Y ); -template void caffe_gpu_sign(cl_kernel Kernel,const int N, const double* X, double* Y ); +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y ); +template void caffe_gpu_abs_ocl(const int N, const double* X, double* Y ); template void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ From c9b345ffae6ee09aed7f56b15920cd6646b0c4f2 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 5 Sep 2015 00:28:41 +0800 Subject: [PATCH 056/124] fixed the random seed --- src/caffe/common.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 3e4e0dc0..e12c48c9 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -12,6 +12,7 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { //To fix: for now we use fixed seed to get same result each time +/* int64_t s, seed, pid; FILE* f = fopen("/dev/urandom", "rb"); if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { @@ -29,7 +30,8 @@ int64_t cluster_seedgen(void) { seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); //return seed; LOG(WARNING) << "return fixed seed 37"; - return 37; +*/ + return 37; } @@ -91,18 +93,14 @@ void* Caffe::RNG::generator() { Caffe::Caffe() { -#ifndef CPU_ONLY cl_int err = clblasSetup(); if(err != CL_SUCCESS){ LOG(ERROR) << "clBLAS setup failed "< Date: Sun, 6 Sep 2015 14:00:34 +0800 Subject: [PATCH 057/124] Clean up the last two warnings --- include/caffe/device.hpp | 2 +- 
...SEARCH.yugao.log.INFO.20150906-133002.7951 | 1250 +++++++++++++++++ ...SEARCH.yugao.log.INFO.20150906-133358.8300 | 1208 ++++++++++++++++ ...SEARCH.yugao.log.INFO.20150906-133437.8316 | 1208 ++++++++++++++++ ...EARCH.yugao.log.INFO.20150906-135805.16515 | 1160 +++++++++++++++ ...EARCH.yugao.log.INFO.20150906-135855.16537 | 1208 ++++++++++++++++ log/caffe.INFO | 1 + src/caffe/device.cpp | 4 +- src/caffe/solver.cpp | 2 +- src/caffe/syncedmem.cpp | 35 +- 10 files changed, 6044 insertions(+), 34 deletions(-) create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 create mode 120000 log/caffe.INFO diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 697e2391..3806eeb6 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -72,7 +72,7 @@ class Device{ cl_kernel GetKernel(std::string kernel_name); void ReleaseKernels(); }; -extern char* buildOption; +extern std::string buildOption; extern Device amdDevice; } // namespace caffe diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 new file mode 100644 index 00000000..c75e1aaa --- /dev/null +++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 @@ -0,0 +1,1250 @@ +Log file created at: 2015/09/06 13:30:02 +Running on machine: AMD-RESEARCH +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0906 13:30:02.150327 7951 caffe.cpp:114] Use GPU with device ID 0 +I0906 13:30:02.187862 7951 device.cpp:230] Number of platforms found:1 +I0906 13:30:02.187903 7951 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing +I0906 13:30:02.187918 7951 
device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE +I0906 13:30:02.187973 7951 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) +I0906 13:30:02.187980 7951 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. +I0906 13:30:02.187991 7951 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices +I0906 13:30:02.188000 7951 device.cpp:286] Number of devices found:1 +I0906 13:30:02.188005 7951 device.cpp:288] DeviceID: 0x2171230 +I0906 13:30:02.188025 7951 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU +I0906 13:30:02.188033 7951 device.cpp:393] Is it integrated GPU?: 0 +I0906 13:30:02.188038 7951 device.cpp:393] Max clock frequency MHz: 930 +I0906 13:30:02.188043 7951 device.cpp:393] Host-Device unified mem: 0 +I0906 13:30:02.188048 7951 device.cpp:393] ECC support: 0 +I0906 13:30:02.188052 7951 device.cpp:393] Endian little: 1 +I0906 13:30:02.188056 7951 device.cpp:393] Max compute units: 44 +I0906 13:30:02.188061 7951 device.cpp:393] Max work group size: 256 +I0906 13:30:02.188066 7951 device.cpp:393] Max work item dimensions: 3 +I0906 13:30:02.188072 7951 device.cpp:393] Max work item sizes: 0x100 +I0906 13:30:02.188078 7951 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE +I0906 13:30:02.188083 7951 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL +I0906 13:30:02.188088 7951 device.cpp:393] Max mem alloc size: 4244635648 +I0906 13:30:02.188092 7951 device.cpp:393] Global mem size: 16878927872 +I0906 13:30:02.188097 7951 device.cpp:393] Local mem size: 32768 +I0906 13:30:02.188107 7951 device.cpp:96] Picked device type : GPU 0 +I0906 13:30:04.630481 7951 device.cpp:152] Build Program +I0906 13:30:04.630708 7951 caffe.cpp:122] Starting Optimization +I0906 13:30:04.630797 7951 solver.cpp:40] Initializing solver from parameters: +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +display: 1 +max_iter: 450000 +lr_policy: "step" +gamma: 0.1 
+momentum: 0.9 +weight_decay: 0.0005 +stepsize: 100000 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU +net: "models/bvlc_alexnet/train_val.prototxt" +I0906 13:30:04.630909 7951 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val.prototxt +I0906 13:30:04.632081 7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data +I0906 13:30:04.632134 7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy +I0906 13:30:04.632319 7951 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TRAIN +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 256 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + 
type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { 
+ lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:30:04.632813 7951 net.cpp:68] Memory required for data: 0 +I0906 13:30:04.632977 7951 layer_factory.hpp:74] Creating layer data +I0906 13:30:04.633033 7951 net.cpp:91] Creating Layer data +I0906 13:30:04.633055 7951 net.cpp:369] data -> data +I0906 13:30:04.633160 7951 net.cpp:369] data -> label +I0906 13:30:04.633183 7951 net.cpp:121] Setting up data +I0906 13:30:04.633196 7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:30:04.642779 7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb +I0906 13:30:04.643064 7951 data_layer.cpp:53] output data size: 256,3,227,227 +I0906 
13:30:04.723888 7951 base_data_layer.cpp:43] Initializing prefetch +I0906 13:30:04.724091 7951 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:30:04.724150 7951 net.cpp:128] Top shape: 256 3 227 227 (39574272) +I0906 13:30:04.724161 7951 net.cpp:128] Top shape: 256 (256) +I0906 13:30:04.724165 7951 net.cpp:134] Memory required for data: 158298112 +I0906 13:30:04.724201 7951 layer_factory.hpp:74] Creating layer conv1 +I0906 13:30:04.724283 7951 net.cpp:91] Creating Layer conv1 +I0906 13:30:04.724328 7951 net.cpp:411] conv1 <- data +I0906 13:30:04.724383 7951 net.cpp:369] conv1 -> conv1 +I0906 13:30:04.724417 7951 net.cpp:121] Setting up conv1 +I0906 13:30:04.729287 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) +I0906 13:30:04.729295 7951 net.cpp:134] Memory required for data: 455667712 +I0906 13:30:04.729333 7951 layer_factory.hpp:74] Creating layer relu1 +I0906 13:30:04.729357 7951 net.cpp:91] Creating Layer relu1 +I0906 13:30:04.729362 7951 net.cpp:411] relu1 <- conv1 +I0906 13:30:04.729377 7951 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:30:04.729385 7951 net.cpp:121] Setting up relu1 +I0906 13:30:04.729408 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) +I0906 13:30:04.729411 7951 net.cpp:134] Memory required for data: 753037312 +I0906 13:30:04.729416 7951 layer_factory.hpp:74] Creating layer norm1 +I0906 13:30:04.729444 7951 net.cpp:91] Creating Layer norm1 +I0906 13:30:04.729450 7951 net.cpp:411] norm1 <- conv1 +I0906 13:30:04.729463 7951 net.cpp:369] norm1 -> norm1 +I0906 13:30:04.729476 7951 net.cpp:121] Setting up norm1 +I0906 13:30:04.729499 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) +I0906 13:30:04.729504 7951 net.cpp:134] Memory required for data: 1050406912 +I0906 13:30:04.729509 7951 layer_factory.hpp:74] Creating layer pool1 +I0906 13:30:04.729532 7951 net.cpp:91] Creating Layer pool1 +I0906 13:30:04.729537 7951 net.cpp:411] pool1 <- norm1 +I0906 13:30:04.729550 7951 net.cpp:369] pool1 -> pool1 +I0906 13:30:04.729564 
7951 net.cpp:121] Setting up pool1 +I0906 13:30:04.729591 7951 net.cpp:128] Top shape: 256 96 27 27 (17915904) +I0906 13:30:04.729596 7951 net.cpp:134] Memory required for data: 1122070528 +I0906 13:30:04.729600 7951 layer_factory.hpp:74] Creating layer conv2 +I0906 13:30:04.729614 7951 net.cpp:91] Creating Layer conv2 +I0906 13:30:04.729619 7951 net.cpp:411] conv2 <- pool1 +I0906 13:30:04.729635 7951 net.cpp:369] conv2 -> conv2 +I0906 13:30:04.729647 7951 net.cpp:121] Setting up conv2 +I0906 13:30:04.769634 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) +I0906 13:30:04.769649 7951 net.cpp:134] Memory required for data: 1313173504 +I0906 13:30:04.769673 7951 layer_factory.hpp:74] Creating layer relu2 +I0906 13:30:04.769695 7951 net.cpp:91] Creating Layer relu2 +I0906 13:30:04.769704 7951 net.cpp:411] relu2 <- conv2 +I0906 13:30:04.769722 7951 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:30:04.769736 7951 net.cpp:121] Setting up relu2 +I0906 13:30:04.769744 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) +I0906 13:30:04.769748 7951 net.cpp:134] Memory required for data: 1504276480 +I0906 13:30:04.769752 7951 layer_factory.hpp:74] Creating layer norm2 +I0906 13:30:04.769769 7951 net.cpp:91] Creating Layer norm2 +I0906 13:30:04.769775 7951 net.cpp:411] norm2 <- conv2 +I0906 13:30:04.769788 7951 net.cpp:369] norm2 -> norm2 +I0906 13:30:04.769800 7951 net.cpp:121] Setting up norm2 +I0906 13:30:04.769820 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) +I0906 13:30:04.769825 7951 net.cpp:134] Memory required for data: 1695379456 +I0906 13:30:04.769829 7951 layer_factory.hpp:74] Creating layer pool2 +I0906 13:30:04.769850 7951 net.cpp:91] Creating Layer pool2 +I0906 13:30:04.769856 7951 net.cpp:411] pool2 <- norm2 +I0906 13:30:04.769870 7951 net.cpp:369] pool2 -> pool2 +I0906 13:30:04.769927 7951 net.cpp:121] Setting up pool2 +I0906 13:30:04.769944 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) +I0906 13:30:04.769949 7951 net.cpp:134] Memory 
required for data: 1739681792 +I0906 13:30:04.769953 7951 layer_factory.hpp:74] Creating layer conv3 +I0906 13:30:04.769975 7951 net.cpp:91] Creating Layer conv3 +I0906 13:30:04.769981 7951 net.cpp:411] conv3 <- pool2 +I0906 13:30:04.769996 7951 net.cpp:369] conv3 -> conv3 +I0906 13:30:04.770010 7951 net.cpp:121] Setting up conv3 +I0906 13:30:04.886401 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) +I0906 13:30:04.886425 7951 net.cpp:134] Memory required for data: 1806135296 +I0906 13:30:04.886471 7951 layer_factory.hpp:74] Creating layer relu3 +I0906 13:30:04.886507 7951 net.cpp:91] Creating Layer relu3 +I0906 13:30:04.886521 7951 net.cpp:411] relu3 <- conv3 +I0906 13:30:04.886548 7951 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:30:04.886565 7951 net.cpp:121] Setting up relu3 +I0906 13:30:04.886575 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) +I0906 13:30:04.886579 7951 net.cpp:134] Memory required for data: 1872588800 +I0906 13:30:04.886584 7951 layer_factory.hpp:74] Creating layer conv4 +I0906 13:30:04.886611 7951 net.cpp:91] Creating Layer conv4 +I0906 13:30:04.886617 7951 net.cpp:411] conv4 <- conv3 +I0906 13:30:04.886633 7951 net.cpp:369] conv4 -> conv4 +I0906 13:30:04.886648 7951 net.cpp:121] Setting up conv4 +I0906 13:30:04.973788 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) +I0906 13:30:04.973810 7951 net.cpp:134] Memory required for data: 1939042304 +I0906 13:30:04.973840 7951 layer_factory.hpp:74] Creating layer relu4 +I0906 13:30:04.973875 7951 net.cpp:91] Creating Layer relu4 +I0906 13:30:04.973891 7951 net.cpp:411] relu4 <- conv4 +I0906 13:30:04.973918 7951 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:30:04.973935 7951 net.cpp:121] Setting up relu4 +I0906 13:30:04.973945 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) +I0906 13:30:04.973949 7951 net.cpp:134] Memory required for data: 2005495808 +I0906 13:30:04.973954 7951 layer_factory.hpp:74] Creating layer conv5 +I0906 13:30:04.973980 7951 net.cpp:91] Creating 
Layer conv5 +I0906 13:30:04.973986 7951 net.cpp:411] conv5 <- conv4 +I0906 13:30:04.974004 7951 net.cpp:369] conv5 -> conv5 +I0906 13:30:04.974019 7951 net.cpp:121] Setting up conv5 +I0906 13:30:05.032649 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) +I0906 13:30:05.032670 7951 net.cpp:134] Memory required for data: 2049798144 +I0906 13:30:05.032712 7951 layer_factory.hpp:74] Creating layer relu5 +I0906 13:30:05.032747 7951 net.cpp:91] Creating Layer relu5 +I0906 13:30:05.032763 7951 net.cpp:411] relu5 <- conv5 +I0906 13:30:05.032788 7951 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:30:05.032805 7951 net.cpp:121] Setting up relu5 +I0906 13:30:05.032814 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) +I0906 13:30:05.032819 7951 net.cpp:134] Memory required for data: 2094100480 +I0906 13:30:05.032824 7951 layer_factory.hpp:74] Creating layer pool5 +I0906 13:30:05.032843 7951 net.cpp:91] Creating Layer pool5 +I0906 13:30:05.032850 7951 net.cpp:411] pool5 <- conv5 +I0906 13:30:05.032863 7951 net.cpp:369] pool5 -> pool5 +I0906 13:30:05.032877 7951 net.cpp:121] Setting up pool5 +I0906 13:30:05.032897 7951 net.cpp:128] Top shape: 256 256 6 6 (2359296) +I0906 13:30:05.032902 7951 net.cpp:134] Memory required for data: 2103537664 +I0906 13:30:05.032907 7951 layer_factory.hpp:74] Creating layer fc6 +I0906 13:30:05.032945 7951 net.cpp:91] Creating Layer fc6 +I0906 13:30:05.032951 7951 net.cpp:411] fc6 <- pool5 +I0906 13:30:05.032966 7951 net.cpp:369] fc6 -> fc6 +I0906 13:30:05.032980 7951 net.cpp:121] Setting up fc6 +I0906 13:30:05.203193 7955 data_layer.cpp:120] Prefetch batch: 478 ms. +I0906 13:30:05.203241 7955 data_layer.cpp:121] Read time: 65.301 ms. +I0906 13:30:05.203250 7955 data_layer.cpp:122] Transform time: 409.394 ms. 
+I0906 13:30:09.817406 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:09.817432 7951 net.cpp:134] Memory required for data: 2107731968 +I0906 13:30:09.817504 7951 layer_factory.hpp:74] Creating layer relu6 +I0906 13:30:09.817538 7951 net.cpp:91] Creating Layer relu6 +I0906 13:30:09.817553 7951 net.cpp:411] relu6 <- fc6 +I0906 13:30:09.817579 7951 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:30:09.817595 7951 net.cpp:121] Setting up relu6 +I0906 13:30:09.817605 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:09.817608 7951 net.cpp:134] Memory required for data: 2111926272 +I0906 13:30:09.817613 7951 layer_factory.hpp:74] Creating layer drop6 +I0906 13:30:09.817643 7951 net.cpp:91] Creating Layer drop6 +I0906 13:30:09.817649 7951 net.cpp:411] drop6 <- fc6 +I0906 13:30:09.817662 7951 net.cpp:358] drop6 -> fc6 (in-place) +I0906 13:30:09.817672 7951 net.cpp:121] Setting up drop6 +I0906 13:30:09.817692 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:09.817695 7951 net.cpp:134] Memory required for data: 2116120576 +I0906 13:30:09.817700 7951 layer_factory.hpp:74] Creating layer fc7 +I0906 13:30:09.817721 7951 net.cpp:91] Creating Layer fc7 +I0906 13:30:09.817728 7951 net.cpp:411] fc7 <- fc6 +I0906 13:30:09.817744 7951 net.cpp:369] fc7 -> fc7 +I0906 13:30:09.817759 7951 net.cpp:121] Setting up fc7 +I0906 13:30:11.938176 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:11.938201 7951 net.cpp:134] Memory required for data: 2120314880 +I0906 13:30:11.938230 7951 layer_factory.hpp:74] Creating layer relu7 +I0906 13:30:11.938263 7951 net.cpp:91] Creating Layer relu7 +I0906 13:30:11.938278 7951 net.cpp:411] relu7 <- fc7 +I0906 13:30:11.938305 7951 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:30:11.938321 7951 net.cpp:121] Setting up relu7 +I0906 13:30:11.938330 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:11.938334 7951 net.cpp:134] Memory required for data: 2124509184 +I0906 13:30:11.938339 7951 
layer_factory.hpp:74] Creating layer drop7 +I0906 13:30:11.938355 7951 net.cpp:91] Creating Layer drop7 +I0906 13:30:11.938360 7951 net.cpp:411] drop7 <- fc7 +I0906 13:30:11.938372 7951 net.cpp:358] drop7 -> fc7 (in-place) +I0906 13:30:11.938382 7951 net.cpp:121] Setting up drop7 +I0906 13:30:11.938397 7951 net.cpp:128] Top shape: 256 4096 (1048576) +I0906 13:30:11.938401 7951 net.cpp:134] Memory required for data: 2128703488 +I0906 13:30:11.938406 7951 layer_factory.hpp:74] Creating layer fc8 +I0906 13:30:11.938427 7951 net.cpp:91] Creating Layer fc8 +I0906 13:30:11.938433 7951 net.cpp:411] fc8 <- fc7 +I0906 13:30:11.938449 7951 net.cpp:369] fc8 -> fc8 +I0906 13:30:11.938464 7951 net.cpp:121] Setting up fc8 +I0906 13:30:12.468230 7951 net.cpp:128] Top shape: 256 1000 (256000) +I0906 13:30:12.468251 7951 net.cpp:134] Memory required for data: 2129727488 +I0906 13:30:12.468279 7951 layer_factory.hpp:74] Creating layer loss +I0906 13:30:12.468333 7951 net.cpp:91] Creating Layer loss +I0906 13:30:12.468348 7951 net.cpp:411] loss <- fc8 +I0906 13:30:12.468370 7951 net.cpp:411] loss <- label +I0906 13:30:12.468389 7951 net.cpp:369] loss -> loss +I0906 13:30:12.468408 7951 net.cpp:121] Setting up loss +I0906 13:30:12.468426 7951 layer_factory.hpp:74] Creating layer loss +I0906 13:30:12.469732 7951 net.cpp:128] Top shape: (1) +I0906 13:30:12.469740 7951 net.cpp:130] with loss weight 1 +I0906 13:30:12.469756 7951 net.cpp:134] Memory required for data: 2129727492 +I0906 13:30:12.469769 7951 net.cpp:193] loss needs backward computation. +I0906 13:30:12.469779 7951 net.cpp:193] fc8 needs backward computation. +I0906 13:30:12.469784 7951 net.cpp:193] drop7 needs backward computation. +I0906 13:30:12.469791 7951 net.cpp:193] relu7 needs backward computation. +I0906 13:30:12.469796 7951 net.cpp:193] fc7 needs backward computation. +I0906 13:30:12.469808 7951 net.cpp:193] drop6 needs backward computation. +I0906 13:30:12.469815 7951 net.cpp:193] relu6 needs backward computation. 
+I0906 13:30:12.469820 7951 net.cpp:193] fc6 needs backward computation. +I0906 13:30:12.469825 7951 net.cpp:193] pool5 needs backward computation. +I0906 13:30:12.469830 7951 net.cpp:193] relu5 needs backward computation. +I0906 13:30:12.469835 7951 net.cpp:193] conv5 needs backward computation. +I0906 13:30:12.469882 7951 net.cpp:193] relu4 needs backward computation. +I0906 13:30:12.469887 7951 net.cpp:193] conv4 needs backward computation. +I0906 13:30:12.469893 7951 net.cpp:193] relu3 needs backward computation. +I0906 13:30:12.469899 7951 net.cpp:193] conv3 needs backward computation. +I0906 13:30:12.469907 7951 net.cpp:193] pool2 needs backward computation. +I0906 13:30:12.469913 7951 net.cpp:193] norm2 needs backward computation. +I0906 13:30:12.469918 7951 net.cpp:193] relu2 needs backward computation. +I0906 13:30:12.469924 7951 net.cpp:193] conv2 needs backward computation. +I0906 13:30:12.469930 7951 net.cpp:193] pool1 needs backward computation. +I0906 13:30:12.469936 7951 net.cpp:193] norm1 needs backward computation. +I0906 13:30:12.469943 7951 net.cpp:193] relu1 needs backward computation. +I0906 13:30:12.469949 7951 net.cpp:193] conv1 needs backward computation. +I0906 13:30:12.469955 7951 net.cpp:195] data does not need backward computation. +I0906 13:30:12.469962 7951 net.cpp:236] This network produces output loss +I0906 13:30:12.470002 7951 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:30:12.470018 7951 net.cpp:248] Network initialization done. 
+I0906 13:30:12.470022 7951 net.cpp:249] Memory required for data: 2129727492 +I0906 13:30:12.470949 7951 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val.prototxt +I0906 13:30:12.471081 7951 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data +I0906 13:30:12.471318 7951 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TEST +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + 
lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: 
"fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:30:12.471688 7951 net.cpp:68] Memory required for data: 0 +I0906 13:30:12.471739 7951 layer_factory.hpp:74] Creating layer data +I0906 13:30:12.471761 7951 net.cpp:91] Creating Layer data +I0906 13:30:12.471772 7951 net.cpp:369] data -> data +I0906 13:30:12.471796 7951 net.cpp:369] data -> label +I0906 13:30:12.471810 7951 net.cpp:121] Setting up data +I0906 13:30:12.471817 7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:30:12.482815 7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb +I0906 13:30:12.483065 7951 data_layer.cpp:53] output data size: 50,3,227,227 +I0906 13:30:12.546061 7951 base_data_layer.cpp:43] Initializing prefetch +I0906 13:30:12.546188 7951 
base_data_layer.cpp:45] Prefetch initialized. +I0906 13:30:12.546222 7951 net.cpp:128] Top shape: 50 3 227 227 (7729350) +I0906 13:30:12.546231 7951 net.cpp:128] Top shape: 50 (50) +I0906 13:30:12.546236 7951 net.cpp:134] Memory required for data: 30917600 +I0906 13:30:12.546268 7951 layer_factory.hpp:74] Creating layer label_data_1_split +I0906 13:30:12.546334 7951 net.cpp:91] Creating Layer label_data_1_split +I0906 13:30:12.546380 7951 net.cpp:411] label_data_1_split <- label +I0906 13:30:12.546419 7951 net.cpp:369] label_data_1_split -> label_data_1_split_0 +I0906 13:30:12.546460 7951 net.cpp:369] label_data_1_split -> label_data_1_split_1 +I0906 13:30:12.546520 7951 net.cpp:121] Setting up label_data_1_split +I0906 13:30:12.546551 7951 net.cpp:128] Top shape: 50 (50) +I0906 13:30:12.546558 7951 net.cpp:128] Top shape: 50 (50) +I0906 13:30:12.546561 7951 net.cpp:134] Memory required for data: 30918000 +I0906 13:30:12.546567 7951 layer_factory.hpp:74] Creating layer conv1 +I0906 13:30:12.546602 7951 net.cpp:91] Creating Layer conv1 +I0906 13:30:12.546608 7951 net.cpp:411] conv1 <- data +I0906 13:30:12.546624 7951 net.cpp:369] conv1 -> conv1 +I0906 13:30:12.546638 7951 net.cpp:121] Setting up conv1 +I0906 13:30:12.551349 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:30:12.551354 7951 net.cpp:134] Memory required for data: 88998000 +I0906 13:30:12.551374 7951 layer_factory.hpp:74] Creating layer relu1 +I0906 13:30:12.551388 7951 net.cpp:91] Creating Layer relu1 +I0906 13:30:12.551393 7951 net.cpp:411] relu1 <- conv1 +I0906 13:30:12.551405 7951 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:30:12.551415 7951 net.cpp:121] Setting up relu1 +I0906 13:30:12.551422 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:30:12.551426 7951 net.cpp:134] Memory required for data: 147078000 +I0906 13:30:12.551431 7951 layer_factory.hpp:74] Creating layer norm1 +I0906 13:30:12.551451 7951 net.cpp:91] Creating Layer norm1 +I0906 13:30:12.551457 7951 
net.cpp:411] norm1 <- conv1 +I0906 13:30:12.551470 7951 net.cpp:369] norm1 -> norm1 +I0906 13:30:12.551481 7951 net.cpp:121] Setting up norm1 +I0906 13:30:12.551499 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:30:12.551504 7951 net.cpp:134] Memory required for data: 205158000 +I0906 13:30:12.551508 7951 layer_factory.hpp:74] Creating layer pool1 +I0906 13:30:12.551524 7951 net.cpp:91] Creating Layer pool1 +I0906 13:30:12.551530 7951 net.cpp:411] pool1 <- norm1 +I0906 13:30:12.551543 7951 net.cpp:369] pool1 -> pool1 +I0906 13:30:12.551553 7951 net.cpp:121] Setting up pool1 +I0906 13:30:12.551571 7951 net.cpp:128] Top shape: 50 96 27 27 (3499200) +I0906 13:30:12.551576 7951 net.cpp:134] Memory required for data: 219154800 +I0906 13:30:12.551580 7951 layer_factory.hpp:74] Creating layer conv2 +I0906 13:30:12.551594 7951 net.cpp:91] Creating Layer conv2 +I0906 13:30:12.551600 7951 net.cpp:411] conv2 <- pool1 +I0906 13:30:12.551615 7951 net.cpp:369] conv2 -> conv2 +I0906 13:30:12.551627 7951 net.cpp:121] Setting up conv2 +I0906 13:30:12.591382 7951 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:30:12.591404 7951 net.cpp:134] Memory required for data: 256479600 +I0906 13:30:12.591442 7951 layer_factory.hpp:74] Creating layer relu2 +I0906 13:30:12.591473 7951 net.cpp:91] Creating Layer relu2 +I0906 13:30:12.591486 7951 net.cpp:411] relu2 <- conv2 +I0906 13:30:12.591511 7951 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:30:12.591526 7951 net.cpp:121] Setting up relu2 +I0906 13:30:12.591536 7951 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:30:12.591539 7951 net.cpp:134] Memory required for data: 293804400 +I0906 13:30:12.591544 7951 layer_factory.hpp:74] Creating layer norm2 +I0906 13:30:12.591572 7951 net.cpp:91] Creating Layer norm2 +I0906 13:30:12.591578 7951 net.cpp:411] norm2 <- conv2 +I0906 13:30:12.591591 7951 net.cpp:369] norm2 -> norm2 +I0906 13:30:12.591609 7951 net.cpp:121] Setting up norm2 +I0906 13:30:12.591629 7951 
net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:30:12.591634 7951 net.cpp:134] Memory required for data: 331129200 +I0906 13:30:12.591639 7951 layer_factory.hpp:74] Creating layer pool2 +I0906 13:30:12.591657 7951 net.cpp:91] Creating Layer pool2 +I0906 13:30:12.591663 7951 net.cpp:411] pool2 <- norm2 +I0906 13:30:12.591676 7951 net.cpp:369] pool2 -> pool2 +I0906 13:30:12.591687 7951 net.cpp:121] Setting up pool2 +I0906 13:30:12.591706 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:30:12.591709 7951 net.cpp:134] Memory required for data: 339782000 +I0906 13:30:12.591714 7951 layer_factory.hpp:74] Creating layer conv3 +I0906 13:30:12.591739 7951 net.cpp:91] Creating Layer conv3 +I0906 13:30:12.591744 7951 net.cpp:411] conv3 <- pool2 +I0906 13:30:12.591802 7951 net.cpp:369] conv3 -> conv3 +I0906 13:30:12.591814 7951 net.cpp:121] Setting up conv3 +I0906 13:30:12.640625 7956 data_layer.cpp:120] Prefetch batch: 94 ms. +I0906 13:30:12.640658 7956 data_layer.cpp:121] Read time: 12.07 ms. +I0906 13:30:12.640666 7956 data_layer.cpp:122] Transform time: 81.163 ms. 
+I0906 13:30:12.705313 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:30:12.705337 7951 net.cpp:134] Memory required for data: 352761200 +I0906 13:30:12.705377 7951 layer_factory.hpp:74] Creating layer relu3 +I0906 13:30:12.705410 7951 net.cpp:91] Creating Layer relu3 +I0906 13:30:12.705425 7951 net.cpp:411] relu3 <- conv3 +I0906 13:30:12.705451 7951 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:30:12.705466 7951 net.cpp:121] Setting up relu3 +I0906 13:30:12.705476 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:30:12.705479 7951 net.cpp:134] Memory required for data: 365740400 +I0906 13:30:12.705484 7951 layer_factory.hpp:74] Creating layer conv4 +I0906 13:30:12.705512 7951 net.cpp:91] Creating Layer conv4 +I0906 13:30:12.705518 7951 net.cpp:411] conv4 <- conv3 +I0906 13:30:12.705534 7951 net.cpp:369] conv4 -> conv4 +I0906 13:30:12.705549 7951 net.cpp:121] Setting up conv4 +I0906 13:30:12.789549 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:30:12.789571 7951 net.cpp:134] Memory required for data: 378719600 +I0906 13:30:12.789597 7951 layer_factory.hpp:74] Creating layer relu4 +I0906 13:30:12.789631 7951 net.cpp:91] Creating Layer relu4 +I0906 13:30:12.789646 7951 net.cpp:411] relu4 <- conv4 +I0906 13:30:12.789674 7951 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:30:12.789690 7951 net.cpp:121] Setting up relu4 +I0906 13:30:12.789698 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:30:12.789701 7951 net.cpp:134] Memory required for data: 391698800 +I0906 13:30:12.789706 7951 layer_factory.hpp:74] Creating layer conv5 +I0906 13:30:12.789732 7951 net.cpp:91] Creating Layer conv5 +I0906 13:30:12.789738 7951 net.cpp:411] conv5 <- conv4 +I0906 13:30:12.789754 7951 net.cpp:369] conv5 -> conv5 +I0906 13:30:12.789770 7951 net.cpp:121] Setting up conv5 +I0906 13:30:12.846217 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:30:12.846233 7951 net.cpp:134] Memory required for data: 400351600 +I0906 
13:30:12.846271 7951 layer_factory.hpp:74] Creating layer relu5 +I0906 13:30:12.846298 7951 net.cpp:91] Creating Layer relu5 +I0906 13:30:12.846312 7951 net.cpp:411] relu5 <- conv5 +I0906 13:30:12.846335 7951 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:30:12.846350 7951 net.cpp:121] Setting up relu5 +I0906 13:30:12.846359 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:30:12.846362 7951 net.cpp:134] Memory required for data: 409004400 +I0906 13:30:12.846367 7951 layer_factory.hpp:74] Creating layer pool5 +I0906 13:30:12.846397 7951 net.cpp:91] Creating Layer pool5 +I0906 13:30:12.846402 7951 net.cpp:411] pool5 <- conv5 +I0906 13:30:12.846417 7951 net.cpp:369] pool5 -> pool5 +I0906 13:30:12.846431 7951 net.cpp:121] Setting up pool5 +I0906 13:30:12.846451 7951 net.cpp:128] Top shape: 50 256 6 6 (460800) +I0906 13:30:12.846454 7951 net.cpp:134] Memory required for data: 410847600 +I0906 13:30:12.846459 7951 layer_factory.hpp:74] Creating layer fc6 +I0906 13:30:12.846479 7951 net.cpp:91] Creating Layer fc6 +I0906 13:30:12.846485 7951 net.cpp:411] fc6 <- pool5 +I0906 13:30:12.846499 7951 net.cpp:369] fc6 -> fc6 +I0906 13:30:12.846513 7951 net.cpp:121] Setting up fc6 +I0906 13:30:17.661206 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:17.661231 7951 net.cpp:134] Memory required for data: 411666800 +I0906 13:30:17.661259 7951 layer_factory.hpp:74] Creating layer relu6 +I0906 13:30:17.661293 7951 net.cpp:91] Creating Layer relu6 +I0906 13:30:17.661309 7951 net.cpp:411] relu6 <- fc6 +I0906 13:30:17.661334 7951 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:30:17.661350 7951 net.cpp:121] Setting up relu6 +I0906 13:30:17.661360 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:17.661363 7951 net.cpp:134] Memory required for data: 412486000 +I0906 13:30:17.661412 7951 layer_factory.hpp:74] Creating layer drop6 +I0906 13:30:17.661428 7951 net.cpp:91] Creating Layer drop6 +I0906 13:30:17.661434 7951 net.cpp:411] drop6 <- fc6 +I0906 
13:30:17.661447 7951 net.cpp:358] drop6 -> fc6 (in-place) +I0906 13:30:17.661456 7951 net.cpp:121] Setting up drop6 +I0906 13:30:17.661470 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:17.661475 7951 net.cpp:134] Memory required for data: 413305200 +I0906 13:30:17.661480 7951 layer_factory.hpp:74] Creating layer fc7 +I0906 13:30:17.661501 7951 net.cpp:91] Creating Layer fc7 +I0906 13:30:17.661507 7951 net.cpp:411] fc7 <- fc6 +I0906 13:30:17.661523 7951 net.cpp:369] fc7 -> fc7 +I0906 13:30:17.661540 7951 net.cpp:121] Setting up fc7 +I0906 13:30:19.790464 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:19.790488 7951 net.cpp:134] Memory required for data: 414124400 +I0906 13:30:19.790514 7951 layer_factory.hpp:74] Creating layer relu7 +I0906 13:30:19.790547 7951 net.cpp:91] Creating Layer relu7 +I0906 13:30:19.790563 7951 net.cpp:411] relu7 <- fc7 +I0906 13:30:19.790591 7951 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:30:19.790607 7951 net.cpp:121] Setting up relu7 +I0906 13:30:19.790616 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:19.790621 7951 net.cpp:134] Memory required for data: 414943600 +I0906 13:30:19.790624 7951 layer_factory.hpp:74] Creating layer drop7 +I0906 13:30:19.790639 7951 net.cpp:91] Creating Layer drop7 +I0906 13:30:19.790645 7951 net.cpp:411] drop7 <- fc7 +I0906 13:30:19.790657 7951 net.cpp:358] drop7 -> fc7 (in-place) +I0906 13:30:19.790668 7951 net.cpp:121] Setting up drop7 +I0906 13:30:19.790683 7951 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:30:19.790688 7951 net.cpp:134] Memory required for data: 415762800 +I0906 13:30:19.790691 7951 layer_factory.hpp:74] Creating layer fc8 +I0906 13:30:19.790714 7951 net.cpp:91] Creating Layer fc8 +I0906 13:30:19.790719 7951 net.cpp:411] fc8 <- fc7 +I0906 13:30:19.790735 7951 net.cpp:369] fc8 -> fc8 +I0906 13:30:19.790760 7951 net.cpp:121] Setting up fc8 +I0906 13:30:20.310474 7951 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:30:20.310497 7951 
net.cpp:134] Memory required for data: 415962800 +I0906 13:30:20.310523 7951 layer_factory.hpp:74] Creating layer fc8_fc8_0_split +I0906 13:30:20.310555 7951 net.cpp:91] Creating Layer fc8_fc8_0_split +I0906 13:30:20.310570 7951 net.cpp:411] fc8_fc8_0_split <- fc8 +I0906 13:30:20.310598 7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 +I0906 13:30:20.310621 7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 +I0906 13:30:20.310633 7951 net.cpp:121] Setting up fc8_fc8_0_split +I0906 13:30:20.310650 7951 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:30:20.310657 7951 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:30:20.310660 7951 net.cpp:134] Memory required for data: 416362800 +I0906 13:30:20.310665 7951 layer_factory.hpp:74] Creating layer accuracy +I0906 13:30:20.310698 7951 net.cpp:91] Creating Layer accuracy +I0906 13:30:20.310704 7951 net.cpp:411] accuracy <- fc8_fc8_0_split_0 +I0906 13:30:20.310715 7951 net.cpp:411] accuracy <- label_data_1_split_0 +I0906 13:30:20.310729 7951 net.cpp:369] accuracy -> accuracy +I0906 13:30:20.310740 7951 net.cpp:121] Setting up accuracy +I0906 13:30:20.310756 7951 net.cpp:128] Top shape: (1) +I0906 13:30:20.310760 7951 net.cpp:134] Memory required for data: 416362804 +I0906 13:30:20.310765 7951 layer_factory.hpp:74] Creating layer loss +I0906 13:30:20.310777 7951 net.cpp:91] Creating Layer loss +I0906 13:30:20.310782 7951 net.cpp:411] loss <- fc8_fc8_0_split_1 +I0906 13:30:20.310793 7951 net.cpp:411] loss <- label_data_1_split_1 +I0906 13:30:20.310804 7951 net.cpp:369] loss -> loss +I0906 13:30:20.310816 7951 net.cpp:121] Setting up loss +I0906 13:30:20.310825 7951 layer_factory.hpp:74] Creating layer loss +I0906 13:30:20.311178 7951 net.cpp:128] Top shape: (1) +I0906 13:30:20.311183 7951 net.cpp:130] with loss weight 1 +I0906 13:30:20.311200 7951 net.cpp:134] Memory required for data: 416362808 +I0906 13:30:20.311250 7951 net.cpp:193] loss needs backward computation. 
+I0906 13:30:20.311259 7951 net.cpp:195] accuracy does not need backward computation. +I0906 13:30:20.311265 7951 net.cpp:193] fc8_fc8_0_split needs backward computation. +I0906 13:30:20.311271 7951 net.cpp:193] fc8 needs backward computation. +I0906 13:30:20.311277 7951 net.cpp:193] drop7 needs backward computation. +I0906 13:30:20.311282 7951 net.cpp:193] relu7 needs backward computation. +I0906 13:30:20.311288 7951 net.cpp:193] fc7 needs backward computation. +I0906 13:30:20.311295 7951 net.cpp:193] drop6 needs backward computation. +I0906 13:30:20.311300 7951 net.cpp:193] relu6 needs backward computation. +I0906 13:30:20.311305 7951 net.cpp:193] fc6 needs backward computation. +I0906 13:30:20.311311 7951 net.cpp:193] pool5 needs backward computation. +I0906 13:30:20.311317 7951 net.cpp:193] relu5 needs backward computation. +I0906 13:30:20.311322 7951 net.cpp:193] conv5 needs backward computation. +I0906 13:30:20.311328 7951 net.cpp:193] relu4 needs backward computation. +I0906 13:30:20.311333 7951 net.cpp:193] conv4 needs backward computation. +I0906 13:30:20.311339 7951 net.cpp:193] relu3 needs backward computation. +I0906 13:30:20.311345 7951 net.cpp:193] conv3 needs backward computation. +I0906 13:30:20.311352 7951 net.cpp:193] pool2 needs backward computation. +I0906 13:30:20.311357 7951 net.cpp:193] norm2 needs backward computation. +I0906 13:30:20.311363 7951 net.cpp:193] relu2 needs backward computation. +I0906 13:30:20.311368 7951 net.cpp:193] conv2 needs backward computation. +I0906 13:30:20.311374 7951 net.cpp:193] pool1 needs backward computation. +I0906 13:30:20.311380 7951 net.cpp:193] norm1 needs backward computation. +I0906 13:30:20.311386 7951 net.cpp:193] relu1 needs backward computation. +I0906 13:30:20.311391 7951 net.cpp:193] conv1 needs backward computation. +I0906 13:30:20.311399 7951 net.cpp:195] label_data_1_split does not need backward computation. +I0906 13:30:20.311406 7951 net.cpp:195] data does not need backward computation. 
+I0906 13:30:20.311411 7951 net.cpp:236] This network produces output accuracy +I0906 13:30:20.311419 7951 net.cpp:236] This network produces output loss +I0906 13:30:20.311455 7951 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:30:20.311468 7951 net.cpp:248] Network initialization done. +I0906 13:30:20.311472 7951 net.cpp:249] Memory required for data: 416362808 +I0906 13:30:20.311663 7951 solver.cpp:53] Solver scaffolding done. +I0906 13:30:20.311787 7951 solver.cpp:270] Solving AlexNet +I0906 13:30:20.311791 7951 solver.cpp:271] Learning Rate Policy: step +I0906 13:30:20.313592 7951 solver.cpp:314] Iteration 0, Testing net (#0) +I0906 13:30:20.313630 7951 net.cpp:696] Copying source layer data +I0906 13:30:20.313635 7951 net.cpp:696] Copying source layer conv1 +I0906 13:30:20.316704 7951 net.cpp:696] Copying source layer relu1 +I0906 13:30:20.316743 7951 net.cpp:696] Copying source layer norm1 +I0906 13:30:20.316756 7951 net.cpp:696] Copying source layer pool1 +I0906 13:30:20.316766 7951 net.cpp:696] Copying source layer conv2 +I0906 13:30:20.317158 7951 net.cpp:696] Copying source layer relu2 +I0906 13:30:20.317173 7951 net.cpp:696] Copying source layer norm2 +I0906 13:30:20.317183 7951 net.cpp:696] Copying source layer pool2 +I0906 13:30:20.317193 7951 net.cpp:696] Copying source layer conv3 +I0906 13:30:20.317970 7951 net.cpp:696] Copying source layer relu3 +I0906 13:30:20.317983 7951 net.cpp:696] Copying source layer conv4 +I0906 13:30:20.318357 7951 net.cpp:696] Copying source layer relu4 +I0906 13:30:20.318372 7951 net.cpp:696] Copying source layer conv5 +I0906 13:30:20.318827 7951 net.cpp:696] Copying source layer relu5 +I0906 13:30:20.318840 7951 net.cpp:696] Copying source layer pool5 +I0906 13:30:20.318850 7951 net.cpp:696] Copying source layer fc6 +I0906 13:30:20.336436 7951 net.cpp:696] Copying source layer relu6 +I0906 13:30:20.336460 7951 net.cpp:696] Copying source layer drop6 +I0906 13:30:20.336467 7951 net.cpp:696] Copying sou 
\ No newline at end of file diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 new file mode 100644 index 00000000..b99da3d4 --- /dev/null +++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 @@ -0,0 +1,1208 @@ +Log file created at: 2015/09/06 13:33:58 +Running on machine: AMD-RESEARCH +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0906 13:33:58.858449 8300 caffe.cpp:114] Use GPU with device ID 0 +I0906 13:33:58.896994 8300 device.cpp:230] Number of platforms found:1 +I0906 13:33:58.897037 8300 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing +I0906 13:33:58.897054 8300 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE +I0906 13:33:58.897061 8300 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) +I0906 13:33:58.897068 8300 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. +I0906 13:33:58.897075 8300 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices +I0906 13:33:58.897086 8300 device.cpp:286] Number of devices found:1 +I0906 13:33:58.897092 8300 device.cpp:288] DeviceID: 0x163a250 +I0906 13:33:58.897126 8300 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU +I0906 13:33:58.897137 8300 device.cpp:393] Is it integrated GPU?: 0 +I0906 13:33:58.897145 8300 device.cpp:393] Max clock frequency MHz: 930 +I0906 13:33:58.897151 8300 device.cpp:393] Host-Device unified mem: 0 +I0906 13:33:58.897157 8300 device.cpp:393] ECC support: 0 +I0906 13:33:58.897164 8300 device.cpp:393] Endian little: 1 +I0906 13:33:58.897171 8300 device.cpp:393] Max compute units: 44 +I0906 13:33:58.897177 8300 device.cpp:393] Max work group size: 256 +I0906 13:33:58.897186 8300 device.cpp:393] Max work item dimensions: 3 +I0906 13:33:58.897192 8300 device.cpp:393] Max work item sizes: 0x100 +I0906 13:33:58.897202 8300 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: 
CL_QUEUE_PROFILING_ENABLE +I0906 13:33:58.897209 8300 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL +I0906 13:33:58.897215 8300 device.cpp:393] Max mem alloc size: 4244635648 +I0906 13:33:58.897222 8300 device.cpp:393] Global mem size: 16878927872 +I0906 13:33:58.897228 8300 device.cpp:393] Local mem size: 32768 +I0906 13:33:58.897241 8300 device.cpp:96] Picked device type : GPU 0 +I0906 13:34:01.301823 8300 device.cpp:152] Build Program +I0906 13:34:01.302049 8300 caffe.cpp:122] Starting Optimization +I0906 13:34:01.302139 8300 solver.cpp:40] Initializing solver from parameters: +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +display: 1 +max_iter: 10 +lr_policy: "step" +gamma: 0.1 +momentum: 0.9 +weight_decay: 0.0005 +stepsize: 100000 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +I0906 13:34:01.302249 8300 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:34:01.303269 8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data +I0906 13:34:01.303316 8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy +I0906 13:34:01.303493 8300 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TRAIN +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 100 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 
11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: 
"ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:34:01.303913 8300 net.cpp:68] Memory required for data: 0 +I0906 13:34:01.304132 8300 layer_factory.hpp:74] Creating layer data +I0906 13:34:01.304185 8300 net.cpp:91] Creating 
Layer data +I0906 13:34:01.304205 8300 net.cpp:369] data -> data +I0906 13:34:01.304306 8300 net.cpp:369] data -> label +I0906 13:34:01.304328 8300 net.cpp:121] Setting up data +I0906 13:34:01.304342 8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:34:01.318087 8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb +I0906 13:34:01.318596 8300 data_layer.cpp:53] output data size: 100,3,227,227 +I0906 13:34:01.351816 8300 base_data_layer.cpp:43] Initializing prefetch +I0906 13:34:01.352555 8300 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:34:01.352643 8300 net.cpp:128] Top shape: 100 3 227 227 (15458700) +I0906 13:34:01.352655 8300 net.cpp:128] Top shape: 100 (100) +I0906 13:34:01.352660 8300 net.cpp:134] Memory required for data: 61835200 +I0906 13:34:01.352697 8300 layer_factory.hpp:74] Creating layer conv1 +I0906 13:34:01.352783 8300 net.cpp:91] Creating Layer conv1 +I0906 13:34:01.352808 8300 net.cpp:411] conv1 <- data +I0906 13:34:01.352902 8300 net.cpp:369] conv1 -> conv1 +I0906 13:34:01.352937 8300 net.cpp:121] Setting up conv1 +I0906 13:34:01.357744 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:01.357751 8300 net.cpp:134] Memory required for data: 177995200 +I0906 13:34:01.357791 8300 layer_factory.hpp:74] Creating layer relu1 +I0906 13:34:01.357815 8300 net.cpp:91] Creating Layer relu1 +I0906 13:34:01.357820 8300 net.cpp:411] relu1 <- conv1 +I0906 13:34:01.357833 8300 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:34:01.357843 8300 net.cpp:121] Setting up relu1 +I0906 13:34:01.357851 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:01.357856 8300 net.cpp:134] Memory required for data: 294155200 +I0906 13:34:01.357861 8300 layer_factory.hpp:74] Creating layer norm1 +I0906 13:34:01.357890 8300 net.cpp:91] Creating Layer norm1 +I0906 13:34:01.357895 8300 net.cpp:411] norm1 <- conv1 +I0906 13:34:01.357908 8300 
net.cpp:369] norm1 -> norm1 +I0906 13:34:01.357920 8300 net.cpp:121] Setting up norm1 +I0906 13:34:01.357944 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:01.357949 8300 net.cpp:134] Memory required for data: 410315200 +I0906 13:34:01.357954 8300 layer_factory.hpp:74] Creating layer pool1 +I0906 13:34:01.357978 8300 net.cpp:91] Creating Layer pool1 +I0906 13:34:01.357985 8300 net.cpp:411] pool1 <- norm1 +I0906 13:34:01.357996 8300 net.cpp:369] pool1 -> pool1 +I0906 13:34:01.358010 8300 net.cpp:121] Setting up pool1 +I0906 13:34:01.358038 8300 net.cpp:128] Top shape: 100 96 27 27 (6998400) +I0906 13:34:01.358042 8300 net.cpp:134] Memory required for data: 438308800 +I0906 13:34:01.358047 8300 layer_factory.hpp:74] Creating layer conv2 +I0906 13:34:01.358060 8300 net.cpp:91] Creating Layer conv2 +I0906 13:34:01.358067 8300 net.cpp:411] conv2 <- pool1 +I0906 13:34:01.358079 8300 net.cpp:369] conv2 -> conv2 +I0906 13:34:01.358091 8300 net.cpp:121] Setting up conv2 +I0906 13:34:01.397493 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:34:01.397511 8300 net.cpp:134] Memory required for data: 512958400 +I0906 13:34:01.397541 8300 layer_factory.hpp:74] Creating layer relu2 +I0906 13:34:01.397567 8300 net.cpp:91] Creating Layer relu2 +I0906 13:34:01.397578 8300 net.cpp:411] relu2 <- conv2 +I0906 13:34:01.397599 8300 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:34:01.397613 8300 net.cpp:121] Setting up relu2 +I0906 13:34:01.397621 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:34:01.397626 8300 net.cpp:134] Memory required for data: 587608000 +I0906 13:34:01.397631 8300 layer_factory.hpp:74] Creating layer norm2 +I0906 13:34:01.397649 8300 net.cpp:91] Creating Layer norm2 +I0906 13:34:01.397655 8300 net.cpp:411] norm2 <- conv2 +I0906 13:34:01.397667 8300 net.cpp:369] norm2 -> norm2 +I0906 13:34:01.397680 8300 net.cpp:121] Setting up norm2 +I0906 13:34:01.397699 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) 
+I0906 13:34:01.397704 8300 net.cpp:134] Memory required for data: 662257600 +I0906 13:34:01.397709 8300 layer_factory.hpp:74] Creating layer pool2 +I0906 13:34:01.397729 8300 net.cpp:91] Creating Layer pool2 +I0906 13:34:01.397735 8300 net.cpp:411] pool2 <- norm2 +I0906 13:34:01.397748 8300 net.cpp:369] pool2 -> pool2 +I0906 13:34:01.397758 8300 net.cpp:121] Setting up pool2 +I0906 13:34:01.397776 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:01.397780 8300 net.cpp:134] Memory required for data: 679563200 +I0906 13:34:01.397830 8300 layer_factory.hpp:74] Creating layer conv3 +I0906 13:34:01.397851 8300 net.cpp:91] Creating Layer conv3 +I0906 13:34:01.397857 8300 net.cpp:411] conv3 <- pool2 +I0906 13:34:01.397871 8300 net.cpp:369] conv3 -> conv3 +I0906 13:34:01.397886 8300 net.cpp:121] Setting up conv3 +I0906 13:34:01.513005 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:01.513030 8300 net.cpp:134] Memory required for data: 705521600 +I0906 13:34:01.513072 8300 layer_factory.hpp:74] Creating layer relu3 +I0906 13:34:01.513104 8300 net.cpp:91] Creating Layer relu3 +I0906 13:34:01.513120 8300 net.cpp:411] relu3 <- conv3 +I0906 13:34:01.513149 8300 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:34:01.513164 8300 net.cpp:121] Setting up relu3 +I0906 13:34:01.513173 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:01.513177 8300 net.cpp:134] Memory required for data: 731480000 +I0906 13:34:01.513182 8300 layer_factory.hpp:74] Creating layer conv4 +I0906 13:34:01.513208 8300 net.cpp:91] Creating Layer conv4 +I0906 13:34:01.513214 8300 net.cpp:411] conv4 <- conv3 +I0906 13:34:01.513229 8300 net.cpp:369] conv4 -> conv4 +I0906 13:34:01.513244 8300 net.cpp:121] Setting up conv4 +I0906 13:34:01.539248 8304 data_layer.cpp:120] Prefetch batch: 186 ms. +I0906 13:34:01.539295 8304 data_layer.cpp:121] Read time: 22.695 ms. +I0906 13:34:01.539304 8304 data_layer.cpp:122] Transform time: 161.707 ms. 
+I0906 13:34:01.598980 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:01.599004 8300 net.cpp:134] Memory required for data: 757438400 +I0906 13:34:01.599028 8300 layer_factory.hpp:74] Creating layer relu4 +I0906 13:34:01.599059 8300 net.cpp:91] Creating Layer relu4 +I0906 13:34:01.599074 8300 net.cpp:411] relu4 <- conv4 +I0906 13:34:01.599100 8300 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:34:01.599117 8300 net.cpp:121] Setting up relu4 +I0906 13:34:01.599125 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:01.599129 8300 net.cpp:134] Memory required for data: 783396800 +I0906 13:34:01.599134 8300 layer_factory.hpp:74] Creating layer conv5 +I0906 13:34:01.599158 8300 net.cpp:91] Creating Layer conv5 +I0906 13:34:01.599164 8300 net.cpp:411] conv5 <- conv4 +I0906 13:34:01.599177 8300 net.cpp:369] conv5 -> conv5 +I0906 13:34:01.599191 8300 net.cpp:121] Setting up conv5 +I0906 13:34:01.658185 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:01.658205 8300 net.cpp:134] Memory required for data: 800702400 +I0906 13:34:01.658242 8300 layer_factory.hpp:74] Creating layer relu5 +I0906 13:34:01.658269 8300 net.cpp:91] Creating Layer relu5 +I0906 13:34:01.658283 8300 net.cpp:411] relu5 <- conv5 +I0906 13:34:01.658308 8300 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:34:01.658321 8300 net.cpp:121] Setting up relu5 +I0906 13:34:01.658330 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:01.658334 8300 net.cpp:134] Memory required for data: 818008000 +I0906 13:34:01.658339 8300 layer_factory.hpp:74] Creating layer pool5 +I0906 13:34:01.658357 8300 net.cpp:91] Creating Layer pool5 +I0906 13:34:01.658362 8300 net.cpp:411] pool5 <- conv5 +I0906 13:34:01.658375 8300 net.cpp:369] pool5 -> pool5 +I0906 13:34:01.658390 8300 net.cpp:121] Setting up pool5 +I0906 13:34:01.658407 8300 net.cpp:128] Top shape: 100 256 6 6 (921600) +I0906 13:34:01.658412 8300 net.cpp:134] Memory required for data: 821694400 +I0906 
13:34:01.658416 8300 layer_factory.hpp:74] Creating layer fc6 +I0906 13:34:01.658447 8300 net.cpp:91] Creating Layer fc6 +I0906 13:34:01.658453 8300 net.cpp:411] fc6 <- pool5 +I0906 13:34:01.658466 8300 net.cpp:369] fc6 -> fc6 +I0906 13:34:01.658480 8300 net.cpp:121] Setting up fc6 +I0906 13:34:06.571331 8300 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:06.571354 8300 net.cpp:134] Memory required for data: 823332800 +I0906 13:34:06.571382 8300 layer_factory.hpp:74] Creating layer relu6 +I0906 13:34:06.571415 8300 net.cpp:91] Creating Layer relu6 +I0906 13:34:06.571430 8300 net.cpp:411] relu6 <- fc6 +I0906 13:34:06.571456 8300 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:34:06.571521 8300 net.cpp:121] Setting up relu6 +I0906 13:34:06.571529 8300 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:06.571533 8300 net.cpp:134] Memory required for data: 824971200 +I0906 13:34:06.571538 8300 layer_factory.hpp:74] Creating layer fc7 +I0906 13:34:06.571558 8300 net.cpp:91] Creating Layer fc7 +I0906 13:34:06.571563 8300 net.cpp:411] fc7 <- fc6 +I0906 13:34:06.571578 8300 net.cpp:369] fc7 -> fc7 +I0906 13:34:06.571593 8300 net.cpp:121] Setting up fc7 +I0906 13:34:08.751106 8300 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:08.751129 8300 net.cpp:134] Memory required for data: 826609600 +I0906 13:34:08.751155 8300 layer_factory.hpp:74] Creating layer relu7 +I0906 13:34:08.751186 8300 net.cpp:91] Creating Layer relu7 +I0906 13:34:08.751202 8300 net.cpp:411] relu7 <- fc7 +I0906 13:34:08.751229 8300 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:34:08.751243 8300 net.cpp:121] Setting up relu7 +I0906 13:34:08.751251 8300 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:08.751255 8300 net.cpp:134] Memory required for data: 828248000 +I0906 13:34:08.751260 8300 layer_factory.hpp:74] Creating layer fc8 +I0906 13:34:08.751281 8300 net.cpp:91] Creating Layer fc8 +I0906 13:34:08.751286 8300 net.cpp:411] fc8 <- fc7 +I0906 13:34:08.751301 8300 net.cpp:369] 
fc8 -> fc8 +I0906 13:34:08.751315 8300 net.cpp:121] Setting up fc8 +I0906 13:34:09.287158 8300 net.cpp:128] Top shape: 100 1000 (100000) +I0906 13:34:09.287181 8300 net.cpp:134] Memory required for data: 828648000 +I0906 13:34:09.287209 8300 layer_factory.hpp:74] Creating layer loss +I0906 13:34:09.287257 8300 net.cpp:91] Creating Layer loss +I0906 13:34:09.287272 8300 net.cpp:411] loss <- fc8 +I0906 13:34:09.287295 8300 net.cpp:411] loss <- label +I0906 13:34:09.287313 8300 net.cpp:369] loss -> loss +I0906 13:34:09.287333 8300 net.cpp:121] Setting up loss +I0906 13:34:09.287349 8300 layer_factory.hpp:74] Creating layer loss +I0906 13:34:09.287860 8300 net.cpp:128] Top shape: (1) +I0906 13:34:09.287865 8300 net.cpp:130] with loss weight 1 +I0906 13:34:09.287881 8300 net.cpp:134] Memory required for data: 828648004 +I0906 13:34:09.287890 8300 net.cpp:193] loss needs backward computation. +I0906 13:34:09.287899 8300 net.cpp:193] fc8 needs backward computation. +I0906 13:34:09.287904 8300 net.cpp:193] relu7 needs backward computation. +I0906 13:34:09.287910 8300 net.cpp:193] fc7 needs backward computation. +I0906 13:34:09.287916 8300 net.cpp:193] relu6 needs backward computation. +I0906 13:34:09.287921 8300 net.cpp:193] fc6 needs backward computation. +I0906 13:34:09.287935 8300 net.cpp:193] pool5 needs backward computation. +I0906 13:34:09.287940 8300 net.cpp:193] relu5 needs backward computation. +I0906 13:34:09.287946 8300 net.cpp:193] conv5 needs backward computation. +I0906 13:34:09.287952 8300 net.cpp:193] relu4 needs backward computation. +I0906 13:34:09.287958 8300 net.cpp:193] conv4 needs backward computation. +I0906 13:34:09.287964 8300 net.cpp:193] relu3 needs backward computation. +I0906 13:34:09.287969 8300 net.cpp:193] conv3 needs backward computation. +I0906 13:34:09.287977 8300 net.cpp:193] pool2 needs backward computation. +I0906 13:34:09.287983 8300 net.cpp:193] norm2 needs backward computation. 
+I0906 13:34:09.287989 8300 net.cpp:193] relu2 needs backward computation. +I0906 13:34:09.287996 8300 net.cpp:193] conv2 needs backward computation. +I0906 13:34:09.288002 8300 net.cpp:193] pool1 needs backward computation. +I0906 13:34:09.288007 8300 net.cpp:193] norm1 needs backward computation. +I0906 13:34:09.288014 8300 net.cpp:193] relu1 needs backward computation. +I0906 13:34:09.288019 8300 net.cpp:193] conv1 needs backward computation. +I0906 13:34:09.288028 8300 net.cpp:195] data does not need backward computation. +I0906 13:34:09.288034 8300 net.cpp:236] This network produces output loss +I0906 13:34:09.288067 8300 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:34:09.288084 8300 net.cpp:248] Network initialization done. +I0906 13:34:09.288087 8300 net.cpp:249] Memory required for data: 828648004 +I0906 13:34:09.289022 8300 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:34:09.289130 8300 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data +I0906 13:34:09.289348 8300 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TEST +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + 
name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + 
convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:34:09.289656 8300 net.cpp:68] Memory required for data: 0 +I0906 13:34:09.289702 8300 layer_factory.hpp:74] Creating layer data +I0906 13:34:09.289721 8300 net.cpp:91] Creating Layer data +I0906 13:34:09.289731 8300 net.cpp:369] data -> data +I0906 
13:34:09.289752 8300 net.cpp:369] data -> label +I0906 13:34:09.289764 8300 net.cpp:121] Setting up data +I0906 13:34:09.289772 8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:34:09.298058 8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb +I0906 13:34:09.298318 8300 data_layer.cpp:53] output data size: 50,3,227,227 +I0906 13:34:09.314699 8300 base_data_layer.cpp:43] Initializing prefetch +I0906 13:34:09.314806 8300 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:34:09.314834 8300 net.cpp:128] Top shape: 50 3 227 227 (7729350) +I0906 13:34:09.314843 8300 net.cpp:128] Top shape: 50 (50) +I0906 13:34:09.314848 8300 net.cpp:134] Memory required for data: 30917600 +I0906 13:34:09.314882 8300 layer_factory.hpp:74] Creating layer label_data_1_split +I0906 13:34:09.314973 8300 net.cpp:91] Creating Layer label_data_1_split +I0906 13:34:09.314997 8300 net.cpp:411] label_data_1_split <- label +I0906 13:34:09.315035 8300 net.cpp:369] label_data_1_split -> label_data_1_split_0 +I0906 13:34:09.315073 8300 net.cpp:369] label_data_1_split -> label_data_1_split_1 +I0906 13:34:09.315085 8300 net.cpp:121] Setting up label_data_1_split +I0906 13:34:09.315116 8300 net.cpp:128] Top shape: 50 (50) +I0906 13:34:09.315124 8300 net.cpp:128] Top shape: 50 (50) +I0906 13:34:09.315127 8300 net.cpp:134] Memory required for data: 30918000 +I0906 13:34:09.315131 8300 layer_factory.hpp:74] Creating layer conv1 +I0906 13:34:09.315165 8300 net.cpp:91] Creating Layer conv1 +I0906 13:34:09.315171 8300 net.cpp:411] conv1 <- data +I0906 13:34:09.315183 8300 net.cpp:369] conv1 -> conv1 +I0906 13:34:09.315198 8300 net.cpp:121] Setting up conv1 +I0906 13:34:09.319859 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:09.319864 8300 net.cpp:134] Memory required for data: 88998000 +I0906 13:34:09.319883 8300 layer_factory.hpp:74] Creating layer relu1 +I0906 13:34:09.319895 8300 
net.cpp:91] Creating Layer relu1 +I0906 13:34:09.319901 8300 net.cpp:411] relu1 <- conv1 +I0906 13:34:09.319913 8300 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:34:09.319926 8300 net.cpp:121] Setting up relu1 +I0906 13:34:09.319933 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:09.319937 8300 net.cpp:134] Memory required for data: 147078000 +I0906 13:34:09.319942 8300 layer_factory.hpp:74] Creating layer norm1 +I0906 13:34:09.319962 8300 net.cpp:91] Creating Layer norm1 +I0906 13:34:09.319968 8300 net.cpp:411] norm1 <- conv1 +I0906 13:34:09.319980 8300 net.cpp:369] norm1 -> norm1 +I0906 13:34:09.319991 8300 net.cpp:121] Setting up norm1 +I0906 13:34:09.320009 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:09.320053 8300 net.cpp:134] Memory required for data: 205158000 +I0906 13:34:09.320060 8300 layer_factory.hpp:74] Creating layer pool1 +I0906 13:34:09.320075 8300 net.cpp:91] Creating Layer pool1 +I0906 13:34:09.320081 8300 net.cpp:411] pool1 <- norm1 +I0906 13:34:09.320093 8300 net.cpp:369] pool1 -> pool1 +I0906 13:34:09.320103 8300 net.cpp:121] Setting up pool1 +I0906 13:34:09.320122 8300 net.cpp:128] Top shape: 50 96 27 27 (3499200) +I0906 13:34:09.320125 8300 net.cpp:134] Memory required for data: 219154800 +I0906 13:34:09.320130 8300 layer_factory.hpp:74] Creating layer conv2 +I0906 13:34:09.320143 8300 net.cpp:91] Creating Layer conv2 +I0906 13:34:09.320149 8300 net.cpp:411] conv2 <- pool1 +I0906 13:34:09.320163 8300 net.cpp:369] conv2 -> conv2 +I0906 13:34:09.320174 8300 net.cpp:121] Setting up conv2 +I0906 13:34:09.359275 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:09.359290 8300 net.cpp:134] Memory required for data: 256479600 +I0906 13:34:09.359316 8300 layer_factory.hpp:74] Creating layer relu2 +I0906 13:34:09.359336 8300 net.cpp:91] Creating Layer relu2 +I0906 13:34:09.359346 8300 net.cpp:411] relu2 <- conv2 +I0906 13:34:09.359365 8300 net.cpp:358] relu2 -> conv2 (in-place) +I0906 
13:34:09.359395 8300 net.cpp:121] Setting up relu2 +I0906 13:34:09.359403 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:09.359407 8300 net.cpp:134] Memory required for data: 293804400 +I0906 13:34:09.359412 8300 layer_factory.hpp:74] Creating layer norm2 +I0906 13:34:09.359433 8300 net.cpp:91] Creating Layer norm2 +I0906 13:34:09.359438 8300 net.cpp:411] norm2 <- conv2 +I0906 13:34:09.359452 8300 net.cpp:369] norm2 -> norm2 +I0906 13:34:09.359467 8300 net.cpp:121] Setting up norm2 +I0906 13:34:09.359486 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:09.359490 8300 net.cpp:134] Memory required for data: 331129200 +I0906 13:34:09.359495 8300 layer_factory.hpp:74] Creating layer pool2 +I0906 13:34:09.359508 8300 net.cpp:91] Creating Layer pool2 +I0906 13:34:09.359514 8300 net.cpp:411] pool2 <- norm2 +I0906 13:34:09.359526 8300 net.cpp:369] pool2 -> pool2 +I0906 13:34:09.359537 8300 net.cpp:121] Setting up pool2 +I0906 13:34:09.359555 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:09.359558 8300 net.cpp:134] Memory required for data: 339782000 +I0906 13:34:09.359563 8300 layer_factory.hpp:74] Creating layer conv3 +I0906 13:34:09.359581 8300 net.cpp:91] Creating Layer conv3 +I0906 13:34:09.359587 8300 net.cpp:411] conv3 <- pool2 +I0906 13:34:09.359601 8300 net.cpp:369] conv3 -> conv3 +I0906 13:34:09.359613 8300 net.cpp:121] Setting up conv3 +I0906 13:34:09.410833 8305 data_layer.cpp:120] Prefetch batch: 95 ms. +I0906 13:34:09.410863 8305 data_layer.cpp:121] Read time: 11.984 ms. +I0906 13:34:09.410871 8305 data_layer.cpp:122] Transform time: 82.885 ms. 
+I0906 13:34:09.474556 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:09.474578 8300 net.cpp:134] Memory required for data: 352761200 +I0906 13:34:09.474618 8300 layer_factory.hpp:74] Creating layer relu3 +I0906 13:34:09.474648 8300 net.cpp:91] Creating Layer relu3 +I0906 13:34:09.474663 8300 net.cpp:411] relu3 <- conv3 +I0906 13:34:09.474689 8300 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:34:09.474704 8300 net.cpp:121] Setting up relu3 +I0906 13:34:09.474714 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:09.474717 8300 net.cpp:134] Memory required for data: 365740400 +I0906 13:34:09.474721 8300 layer_factory.hpp:74] Creating layer conv4 +I0906 13:34:09.474745 8300 net.cpp:91] Creating Layer conv4 +I0906 13:34:09.474751 8300 net.cpp:411] conv4 <- conv3 +I0906 13:34:09.474766 8300 net.cpp:369] conv4 -> conv4 +I0906 13:34:09.474781 8300 net.cpp:121] Setting up conv4 +I0906 13:34:09.562909 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:09.562930 8300 net.cpp:134] Memory required for data: 378719600 +I0906 13:34:09.562957 8300 layer_factory.hpp:74] Creating layer relu4 +I0906 13:34:09.562988 8300 net.cpp:91] Creating Layer relu4 +I0906 13:34:09.563051 8300 net.cpp:411] relu4 <- conv4 +I0906 13:34:09.563086 8300 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:34:09.563102 8300 net.cpp:121] Setting up relu4 +I0906 13:34:09.563112 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:09.563117 8300 net.cpp:134] Memory required for data: 391698800 +I0906 13:34:09.563122 8300 layer_factory.hpp:74] Creating layer conv5 +I0906 13:34:09.563146 8300 net.cpp:91] Creating Layer conv5 +I0906 13:34:09.563153 8300 net.cpp:411] conv5 <- conv4 +I0906 13:34:09.563168 8300 net.cpp:369] conv5 -> conv5 +I0906 13:34:09.563182 8300 net.cpp:121] Setting up conv5 +I0906 13:34:09.619202 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:09.619220 8300 net.cpp:134] Memory required for data: 400351600 +I0906 
13:34:09.619256 8300 layer_factory.hpp:74] Creating layer relu5 +I0906 13:34:09.619284 8300 net.cpp:91] Creating Layer relu5 +I0906 13:34:09.619298 8300 net.cpp:411] relu5 <- conv5 +I0906 13:34:09.619321 8300 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:34:09.619336 8300 net.cpp:121] Setting up relu5 +I0906 13:34:09.619344 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:09.619349 8300 net.cpp:134] Memory required for data: 409004400 +I0906 13:34:09.619354 8300 layer_factory.hpp:74] Creating layer pool5 +I0906 13:34:09.619380 8300 net.cpp:91] Creating Layer pool5 +I0906 13:34:09.619386 8300 net.cpp:411] pool5 <- conv5 +I0906 13:34:09.619398 8300 net.cpp:369] pool5 -> pool5 +I0906 13:34:09.619411 8300 net.cpp:121] Setting up pool5 +I0906 13:34:09.619431 8300 net.cpp:128] Top shape: 50 256 6 6 (460800) +I0906 13:34:09.619434 8300 net.cpp:134] Memory required for data: 410847600 +I0906 13:34:09.619439 8300 layer_factory.hpp:74] Creating layer fc6 +I0906 13:34:09.619457 8300 net.cpp:91] Creating Layer fc6 +I0906 13:34:09.619463 8300 net.cpp:411] fc6 <- pool5 +I0906 13:34:09.619477 8300 net.cpp:369] fc6 -> fc6 +I0906 13:34:09.619488 8300 net.cpp:121] Setting up fc6 +I0906 13:34:15.320122 8300 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:15.320147 8300 net.cpp:134] Memory required for data: 411666800 +I0906 13:34:15.320174 8300 layer_factory.hpp:74] Creating layer relu6 +I0906 13:34:15.320206 8300 net.cpp:91] Creating Layer relu6 +I0906 13:34:15.320222 8300 net.cpp:411] relu6 <- fc6 +I0906 13:34:15.320248 8300 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:34:15.320263 8300 net.cpp:121] Setting up relu6 +I0906 13:34:15.320272 8300 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:15.320276 8300 net.cpp:134] Memory required for data: 412486000 +I0906 13:34:15.320281 8300 layer_factory.hpp:74] Creating layer fc7 +I0906 13:34:15.320302 8300 net.cpp:91] Creating Layer fc7 +I0906 13:34:15.320308 8300 net.cpp:411] fc7 <- fc6 +I0906 
13:34:15.320322 8300 net.cpp:369] fc7 -> fc7 +I0906 13:34:15.320338 8300 net.cpp:121] Setting up fc7 +I0906 13:34:17.700968 8300 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:17.700994 8300 net.cpp:134] Memory required for data: 413305200 +I0906 13:34:17.701020 8300 layer_factory.hpp:74] Creating layer relu7 +I0906 13:34:17.701052 8300 net.cpp:91] Creating Layer relu7 +I0906 13:34:17.701067 8300 net.cpp:411] relu7 <- fc7 +I0906 13:34:17.701093 8300 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:34:17.701109 8300 net.cpp:121] Setting up relu7 +I0906 13:34:17.701117 8300 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:17.701122 8300 net.cpp:134] Memory required for data: 414124400 +I0906 13:34:17.701125 8300 layer_factory.hpp:74] Creating layer fc8 +I0906 13:34:17.701146 8300 net.cpp:91] Creating Layer fc8 +I0906 13:34:17.701153 8300 net.cpp:411] fc8 <- fc7 +I0906 13:34:17.701166 8300 net.cpp:369] fc8 -> fc8 +I0906 13:34:17.701191 8300 net.cpp:121] Setting up fc8 +I0906 13:34:18.224659 8300 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:18.224681 8300 net.cpp:134] Memory required for data: 414324400 +I0906 13:34:18.224707 8300 layer_factory.hpp:74] Creating layer fc8_fc8_0_split +I0906 13:34:18.224737 8300 net.cpp:91] Creating Layer fc8_fc8_0_split +I0906 13:34:18.224798 8300 net.cpp:411] fc8_fc8_0_split <- fc8 +I0906 13:34:18.224828 8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 +I0906 13:34:18.224848 8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 +I0906 13:34:18.224860 8300 net.cpp:121] Setting up fc8_fc8_0_split +I0906 13:34:18.224876 8300 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:18.224882 8300 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:18.224886 8300 net.cpp:134] Memory required for data: 414724400 +I0906 13:34:18.224891 8300 layer_factory.hpp:74] Creating layer accuracy +I0906 13:34:18.224922 8300 net.cpp:91] Creating Layer accuracy +I0906 13:34:18.224927 8300 net.cpp:411] accuracy <- fc8_fc8_0_split_0 
+I0906 13:34:18.224938 8300 net.cpp:411] accuracy <- label_data_1_split_0 +I0906 13:34:18.224949 8300 net.cpp:369] accuracy -> accuracy +I0906 13:34:18.224961 8300 net.cpp:121] Setting up accuracy +I0906 13:34:18.224977 8300 net.cpp:128] Top shape: (1) +I0906 13:34:18.224980 8300 net.cpp:134] Memory required for data: 414724404 +I0906 13:34:18.224985 8300 layer_factory.hpp:74] Creating layer loss +I0906 13:34:18.224997 8300 net.cpp:91] Creating Layer loss +I0906 13:34:18.225003 8300 net.cpp:411] loss <- fc8_fc8_0_split_1 +I0906 13:34:18.225013 8300 net.cpp:411] loss <- label_data_1_split_1 +I0906 13:34:18.225023 8300 net.cpp:369] loss -> loss +I0906 13:34:18.225033 8300 net.cpp:121] Setting up loss +I0906 13:34:18.225044 8300 layer_factory.hpp:74] Creating layer loss +I0906 13:34:18.225343 8300 net.cpp:128] Top shape: (1) +I0906 13:34:18.225348 8300 net.cpp:130] with loss weight 1 +I0906 13:34:18.225364 8300 net.cpp:134] Memory required for data: 414724408 +I0906 13:34:18.225371 8300 net.cpp:193] loss needs backward computation. +I0906 13:34:18.225378 8300 net.cpp:195] accuracy does not need backward computation. +I0906 13:34:18.225386 8300 net.cpp:193] fc8_fc8_0_split needs backward computation. +I0906 13:34:18.225391 8300 net.cpp:193] fc8 needs backward computation. +I0906 13:34:18.225397 8300 net.cpp:193] relu7 needs backward computation. +I0906 13:34:18.225404 8300 net.cpp:193] fc7 needs backward computation. +I0906 13:34:18.225409 8300 net.cpp:193] relu6 needs backward computation. +I0906 13:34:18.225414 8300 net.cpp:193] fc6 needs backward computation. +I0906 13:34:18.225420 8300 net.cpp:193] pool5 needs backward computation. +I0906 13:34:18.225426 8300 net.cpp:193] relu5 needs backward computation. +I0906 13:34:18.225431 8300 net.cpp:193] conv5 needs backward computation. +I0906 13:34:18.225438 8300 net.cpp:193] relu4 needs backward computation. +I0906 13:34:18.225443 8300 net.cpp:193] conv4 needs backward computation. 
+I0906 13:34:18.225450 8300 net.cpp:193] relu3 needs backward computation. +I0906 13:34:18.225455 8300 net.cpp:193] conv3 needs backward computation. +I0906 13:34:18.225461 8300 net.cpp:193] pool2 needs backward computation. +I0906 13:34:18.225466 8300 net.cpp:193] norm2 needs backward computation. +I0906 13:34:18.225472 8300 net.cpp:193] relu2 needs backward computation. +I0906 13:34:18.225477 8300 net.cpp:193] conv2 needs backward computation. +I0906 13:34:18.225484 8300 net.cpp:193] pool1 needs backward computation. +I0906 13:34:18.225491 8300 net.cpp:193] norm1 needs backward computation. +I0906 13:34:18.225496 8300 net.cpp:193] relu1 needs backward computation. +I0906 13:34:18.225502 8300 net.cpp:193] conv1 needs backward computation. +I0906 13:34:18.225508 8300 net.cpp:195] label_data_1_split does not need backward computation. +I0906 13:34:18.225515 8300 net.cpp:195] data does not need backward computation. +I0906 13:34:18.225520 8300 net.cpp:236] This network produces output accuracy +I0906 13:34:18.225527 8300 net.cpp:236] This network produces output loss +I0906 13:34:18.225561 8300 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:34:18.225574 8300 net.cpp:248] Network initialization done. +I0906 13:34:18.225579 8300 net.cpp:249] Memory required for data: 414724408 +I0906 13:34:18.225764 8300 solver.cpp:53] Solver scaffolding done. 
+I0906 13:34:18.225879 8300 solver.cpp:270] Solving AlexNet +I0906 13:34:18.225898 8300 solver.cpp:271] Learning Rate Policy: step +I0906 13:34:18.227551 8300 solver.cpp:314] Iteration 0, Testing net (#0) +I0906 13:34:18.227571 8300 net.cpp:696] Copying source layer data +I0906 13:34:18.227577 8300 net.cpp:696] Copying source layer conv1 +I0906 13:34:18.230358 8300 net.cpp:696] Copying source layer relu1 +I0906 13:34:18.230398 8300 net.cpp:696] Copying source layer norm1 +I0906 13:34:18.230409 8300 net.cpp:696] Copying source layer pool1 +I0906 13:34:18.230419 8300 net.cpp:696] Copying source layer conv2 +I0906 13:34:18.230605 8300 net.cpp:696] Copying source layer relu2 +I0906 13:34:18.230624 8300 net.cpp:696] Copying source layer norm2 +I0906 13:34:18.230634 8300 net.cpp:696] Copying source layer pool2 +I0906 13:34:18.230644 8300 net.cpp:696] Copying source layer conv3 +I0906 13:34:18.231482 8300 net.cpp:696] Copying source layer relu3 +I0906 13:34:18.231510 8300 net.cpp:696] Copying source layer conv4 +I0906 13:34:18.232178 8300 net.cpp:696] Copying source layer relu4 +I0906 13:34:18.232195 8300 net.cpp:696] Copying source layer conv5 +I0906 13:34:18.232681 8300 net.cpp:696] Copying source layer relu5 +I0906 13:34:18.232697 8300 net.cpp:696] Copying source layer pool5 +I0906 13:34:18.232708 8300 net.cpp:696] Copying source layer fc6 +I0906 13:34:18.250728 8300 net.cpp:696] Copying source layer relu6 +I0906 13:34:18.250753 8300 net.cpp:696] Copying source layer fc7 +I0906 13:34:18.257216 8300 net.cpp:696] Copying source layer relu7 +I0906 13:34:18.257241 8300 net.cpp:696] Copying source layer fc8 +I0906 13:34:18.258977 8300 net.cpp:696] Copying source layer loss +I0906 13:34:18.259091 8300 base_data_layer.cpp:89] Thread joined +I0906 13:34:18.263509 8300 base_data_layer.cpp:93] Prefetch copied +I0906 13:34:18.263875 8300 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:34:18.362475 8306 data_layer.cpp:120] Prefetch batch: 98 ms. 
+I0906 13:34:18.362507 8306 data_layer.cpp:121] Read time: 12.694 ms. +I0906 13:34:18.362515 8306 data_layer.cpp:122] Transform time: 84.611 ms. +I0906 13:34:21.291707 8300 solver.cpp:363] Test net output #0: accuracy = 0 +I0906 13:34:21.291733 8300 solver.cpp:363] Test net output #1: loss = 6.91228 (* 1 = 6.91228 loss) +I0906 13:34:21.291775 8300 base_data_layer.cpp:89] Thread joined +I0906 13:34:21.300678 8300 base_data_layer.cpp:93] Prefetch copied +I0906 13:34:21.301050 8300 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:34:21.491194 8310 data_layer.cpp:120] Prefetch batch: 189 ms. +I0906 13:34:21.491225 8310 data_layer.cpp:121] Read time: 24.533 ms. +I0906 13:34:21.491231 8310 data_layer.cpp:122] Transform time: 163.65 ms. +I0906 13:34:28.088075 8300 solver.cpp:234] Iteration 0, loss = 0 +I0906 13:34:28.088134 8300 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) +I0906 13:34:28.088184 8300 solver.cpp:506] Iteration 0, lr = 0.01 +I0906 13:34:28.203598 8300 base_data_layer.cpp:89] Thread joined +I0906 13:34:28.212023 8300 base_data_layer.cpp:93] Prefetch copied +I0906 13:34:28.212162 8300 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:34:28.397155 8312 data_layer.cpp:120] Prefetch batch: 184 ms. +I0906 13:34:28.397193 8312 data_layer.cpp:121] Read time: 23.16 ms. +I0906 13:34:28.397200 8312 data_layer.cpp:122] Transform time: 159.902 ms. 
+I0906 13:34:30.978493 8300 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 new file mode 100644 index 00000000..93afd4cf --- /dev/null +++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 @@ -0,0 +1,1208 @@ +Log file created at: 2015/09/06 13:34:37 +Running on machine: AMD-RESEARCH +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0906 13:34:37.585557 8316 caffe.cpp:114] Use GPU with device ID 0 +I0906 13:34:37.621670 8316 device.cpp:230] Number of platforms found:1 +I0906 13:34:37.621708 8316 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing +I0906 13:34:37.621721 8316 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE +I0906 13:34:37.621724 8316 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) +I0906 13:34:37.621728 8316 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. +I0906 13:34:37.621732 8316 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices +I0906 13:34:37.621739 8316 device.cpp:286] Number of devices found:1 +I0906 13:34:37.621743 8316 device.cpp:288] DeviceID: 0x22ed250 +I0906 13:34:37.621760 8316 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU +I0906 13:34:37.621767 8316 device.cpp:393] Is it integrated GPU?: 0 +I0906 13:34:37.621772 8316 device.cpp:393] Max clock frequency MHz: 930 +I0906 13:34:37.621775 8316 device.cpp:393] Host-Device unified mem: 0 +I0906 13:34:37.621779 8316 device.cpp:393] ECC support: 0 +I0906 13:34:37.621783 8316 device.cpp:393] Endian little: 1 +I0906 13:34:37.621788 8316 device.cpp:393] Max compute units: 44 +I0906 13:34:37.621791 8316 device.cpp:393] Max work group size: 256 +I0906 13:34:37.621796 8316 device.cpp:393] Max work item dimensions: 3 +I0906 13:34:37.621801 8316 device.cpp:393] Max work item sizes: 0x100 +I0906 13:34:37.621806 8316 
device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE +I0906 13:34:37.621811 8316 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL +I0906 13:34:37.621815 8316 device.cpp:393] Max mem alloc size: 4244635648 +I0906 13:34:37.621819 8316 device.cpp:393] Global mem size: 16878927872 +I0906 13:34:37.621822 8316 device.cpp:393] Local mem size: 32768 +I0906 13:34:37.621830 8316 device.cpp:96] Picked device type : GPU 0 +I0906 13:34:40.036291 8316 device.cpp:152] Build Program +I0906 13:34:40.036520 8316 caffe.cpp:122] Starting Optimization +I0906 13:34:40.036612 8316 solver.cpp:40] Initializing solver from parameters: +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +display: 1 +max_iter: 10 +lr_policy: "step" +gamma: 0.1 +momentum: 0.9 +weight_decay: 0.0005 +stepsize: 100000 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +I0906 13:34:40.036731 8316 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:34:40.037874 8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data +I0906 13:34:40.037925 8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy +I0906 13:34:40.038099 8316 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TRAIN +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 100 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + 
convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + 
value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:34:40.038537 8316 net.cpp:68] Memory required for data: 0 +I0906 13:34:40.038749 8316 layer_factory.hpp:74] Creating layer 
data +I0906 13:34:40.038802 8316 net.cpp:91] Creating Layer data +I0906 13:34:40.038825 8316 net.cpp:369] data -> data +I0906 13:34:40.038928 8316 net.cpp:369] data -> label +I0906 13:34:40.038950 8316 net.cpp:121] Setting up data +I0906 13:34:40.038962 8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:34:40.048738 8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb +I0906 13:34:40.049080 8316 data_layer.cpp:53] output data size: 100,3,227,227 +I0906 13:34:40.081225 8316 base_data_layer.cpp:43] Initializing prefetch +I0906 13:34:40.081426 8316 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:34:40.081490 8316 net.cpp:128] Top shape: 100 3 227 227 (15458700) +I0906 13:34:40.081500 8316 net.cpp:128] Top shape: 100 (100) +I0906 13:34:40.081504 8316 net.cpp:134] Memory required for data: 61835200 +I0906 13:34:40.081537 8316 layer_factory.hpp:74] Creating layer conv1 +I0906 13:34:40.081619 8316 net.cpp:91] Creating Layer conv1 +I0906 13:34:40.081641 8316 net.cpp:411] conv1 <- data +I0906 13:34:40.081694 8316 net.cpp:369] conv1 -> conv1 +I0906 13:34:40.081758 8316 net.cpp:121] Setting up conv1 +I0906 13:34:40.088135 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:40.088160 8316 net.cpp:134] Memory required for data: 177995200 +I0906 13:34:40.088239 8316 layer_factory.hpp:74] Creating layer relu1 +I0906 13:34:40.088297 8316 net.cpp:91] Creating Layer relu1 +I0906 13:34:40.088315 8316 net.cpp:411] relu1 <- conv1 +I0906 13:34:40.088351 8316 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:34:40.088372 8316 net.cpp:121] Setting up relu1 +I0906 13:34:40.088385 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:40.088390 8316 net.cpp:134] Memory required for data: 294155200 +I0906 13:34:40.088397 8316 layer_factory.hpp:74] Creating layer norm1 +I0906 13:34:40.088435 8316 net.cpp:91] Creating Layer norm1 +I0906 13:34:40.088444 8316 
net.cpp:411] norm1 <- conv1 +I0906 13:34:40.088466 8316 net.cpp:369] norm1 -> norm1 +I0906 13:34:40.088486 8316 net.cpp:121] Setting up norm1 +I0906 13:34:40.088531 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:34:40.088537 8316 net.cpp:134] Memory required for data: 410315200 +I0906 13:34:40.088543 8316 layer_factory.hpp:74] Creating layer pool1 +I0906 13:34:40.088580 8316 net.cpp:91] Creating Layer pool1 +I0906 13:34:40.088590 8316 net.cpp:411] pool1 <- norm1 +I0906 13:34:40.088613 8316 net.cpp:369] pool1 -> pool1 +I0906 13:34:40.088637 8316 net.cpp:121] Setting up pool1 +I0906 13:34:40.088686 8316 net.cpp:128] Top shape: 100 96 27 27 (6998400) +I0906 13:34:40.088691 8316 net.cpp:134] Memory required for data: 438308800 +I0906 13:34:40.088701 8316 layer_factory.hpp:74] Creating layer conv2 +I0906 13:34:40.088739 8316 net.cpp:91] Creating Layer conv2 +I0906 13:34:40.088750 8316 net.cpp:411] conv2 <- pool1 +I0906 13:34:40.088783 8316 net.cpp:369] conv2 -> conv2 +I0906 13:34:40.088804 8316 net.cpp:121] Setting up conv2 +I0906 13:34:40.129534 8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:34:40.129550 8316 net.cpp:134] Memory required for data: 512958400 +I0906 13:34:40.129585 8316 layer_factory.hpp:74] Creating layer relu2 +I0906 13:34:40.129613 8316 net.cpp:91] Creating Layer relu2 +I0906 13:34:40.129624 8316 net.cpp:411] relu2 <- conv2 +I0906 13:34:40.129647 8316 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:34:40.129662 8316 net.cpp:121] Setting up relu2 +I0906 13:34:40.129670 8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:34:40.129674 8316 net.cpp:134] Memory required for data: 587608000 +I0906 13:34:40.129679 8316 layer_factory.hpp:74] Creating layer norm2 +I0906 13:34:40.129698 8316 net.cpp:91] Creating Layer norm2 +I0906 13:34:40.129703 8316 net.cpp:411] norm2 <- conv2 +I0906 13:34:40.129717 8316 net.cpp:369] norm2 -> norm2 +I0906 13:34:40.129730 8316 net.cpp:121] Setting up norm2 +I0906 13:34:40.129750 
8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:34:40.129755 8316 net.cpp:134] Memory required for data: 662257600 +I0906 13:34:40.129760 8316 layer_factory.hpp:74] Creating layer pool2 +I0906 13:34:40.129783 8316 net.cpp:91] Creating Layer pool2 +I0906 13:34:40.129789 8316 net.cpp:411] pool2 <- norm2 +I0906 13:34:40.129802 8316 net.cpp:369] pool2 -> pool2 +I0906 13:34:40.129813 8316 net.cpp:121] Setting up pool2 +I0906 13:34:40.129832 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:40.129837 8316 net.cpp:134] Memory required for data: 679563200 +I0906 13:34:40.129887 8316 layer_factory.hpp:74] Creating layer conv3 +I0906 13:34:40.129910 8316 net.cpp:91] Creating Layer conv3 +I0906 13:34:40.129916 8316 net.cpp:411] conv3 <- pool2 +I0906 13:34:40.129933 8316 net.cpp:369] conv3 -> conv3 +I0906 13:34:40.129948 8316 net.cpp:121] Setting up conv3 +I0906 13:34:40.246141 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:40.246165 8316 net.cpp:134] Memory required for data: 705521600 +I0906 13:34:40.246211 8316 layer_factory.hpp:74] Creating layer relu3 +I0906 13:34:40.246247 8316 net.cpp:91] Creating Layer relu3 +I0906 13:34:40.246261 8316 net.cpp:411] relu3 <- conv3 +I0906 13:34:40.246287 8316 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:34:40.246304 8316 net.cpp:121] Setting up relu3 +I0906 13:34:40.246314 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:40.246317 8316 net.cpp:134] Memory required for data: 731480000 +I0906 13:34:40.246322 8316 layer_factory.hpp:74] Creating layer conv4 +I0906 13:34:40.246351 8316 net.cpp:91] Creating Layer conv4 +I0906 13:34:40.246356 8316 net.cpp:411] conv4 <- conv3 +I0906 13:34:40.246372 8316 net.cpp:369] conv4 -> conv4 +I0906 13:34:40.246387 8316 net.cpp:121] Setting up conv4 +I0906 13:34:40.273671 8320 data_layer.cpp:120] Prefetch batch: 191 ms. +I0906 13:34:40.273718 8320 data_layer.cpp:121] Read time: 24.494 ms. 
+I0906 13:34:40.273727 8320 data_layer.cpp:122] Transform time: 165.29 ms. +I0906 13:34:40.332166 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:40.332187 8316 net.cpp:134] Memory required for data: 757438400 +I0906 13:34:40.332214 8316 layer_factory.hpp:74] Creating layer relu4 +I0906 13:34:40.332247 8316 net.cpp:91] Creating Layer relu4 +I0906 13:34:40.332262 8316 net.cpp:411] relu4 <- conv4 +I0906 13:34:40.332288 8316 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:34:40.332304 8316 net.cpp:121] Setting up relu4 +I0906 13:34:40.332314 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:34:40.332317 8316 net.cpp:134] Memory required for data: 783396800 +I0906 13:34:40.332321 8316 layer_factory.hpp:74] Creating layer conv5 +I0906 13:34:40.332350 8316 net.cpp:91] Creating Layer conv5 +I0906 13:34:40.332355 8316 net.cpp:411] conv5 <- conv4 +I0906 13:34:40.332371 8316 net.cpp:369] conv5 -> conv5 +I0906 13:34:40.332386 8316 net.cpp:121] Setting up conv5 +I0906 13:34:40.388872 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:40.388891 8316 net.cpp:134] Memory required for data: 800702400 +I0906 13:34:40.388931 8316 layer_factory.hpp:74] Creating layer relu5 +I0906 13:34:40.388959 8316 net.cpp:91] Creating Layer relu5 +I0906 13:34:40.388972 8316 net.cpp:411] relu5 <- conv5 +I0906 13:34:40.388995 8316 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:34:40.389010 8316 net.cpp:121] Setting up relu5 +I0906 13:34:40.389019 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:34:40.389024 8316 net.cpp:134] Memory required for data: 818008000 +I0906 13:34:40.389029 8316 layer_factory.hpp:74] Creating layer pool5 +I0906 13:34:40.389049 8316 net.cpp:91] Creating Layer pool5 +I0906 13:34:40.389053 8316 net.cpp:411] pool5 <- conv5 +I0906 13:34:40.389067 8316 net.cpp:369] pool5 -> pool5 +I0906 13:34:40.389081 8316 net.cpp:121] Setting up pool5 +I0906 13:34:40.389102 8316 net.cpp:128] Top shape: 100 256 6 6 (921600) +I0906 
13:34:40.389107 8316 net.cpp:134] Memory required for data: 821694400 +I0906 13:34:40.389112 8316 layer_factory.hpp:74] Creating layer fc6 +I0906 13:34:40.389147 8316 net.cpp:91] Creating Layer fc6 +I0906 13:34:40.389153 8316 net.cpp:411] fc6 <- pool5 +I0906 13:34:40.389169 8316 net.cpp:369] fc6 -> fc6 +I0906 13:34:40.389183 8316 net.cpp:121] Setting up fc6 +I0906 13:34:45.208031 8316 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:45.208055 8316 net.cpp:134] Memory required for data: 823332800 +I0906 13:34:45.208081 8316 layer_factory.hpp:74] Creating layer relu6 +I0906 13:34:45.208112 8316 net.cpp:91] Creating Layer relu6 +I0906 13:34:45.208128 8316 net.cpp:411] relu6 <- fc6 +I0906 13:34:45.208154 8316 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:34:45.208210 8316 net.cpp:121] Setting up relu6 +I0906 13:34:45.208220 8316 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:45.208223 8316 net.cpp:134] Memory required for data: 824971200 +I0906 13:34:45.208228 8316 layer_factory.hpp:74] Creating layer fc7 +I0906 13:34:45.208250 8316 net.cpp:91] Creating Layer fc7 +I0906 13:34:45.208256 8316 net.cpp:411] fc7 <- fc6 +I0906 13:34:45.208273 8316 net.cpp:369] fc7 -> fc7 +I0906 13:34:45.208288 8316 net.cpp:121] Setting up fc7 +I0906 13:34:47.352208 8316 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:47.352234 8316 net.cpp:134] Memory required for data: 826609600 +I0906 13:34:47.352262 8316 layer_factory.hpp:74] Creating layer relu7 +I0906 13:34:47.352295 8316 net.cpp:91] Creating Layer relu7 +I0906 13:34:47.352311 8316 net.cpp:411] relu7 <- fc7 +I0906 13:34:47.352339 8316 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:34:47.352355 8316 net.cpp:121] Setting up relu7 +I0906 13:34:47.352363 8316 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:34:47.352368 8316 net.cpp:134] Memory required for data: 828248000 +I0906 13:34:47.352373 8316 layer_factory.hpp:74] Creating layer fc8 +I0906 13:34:47.352396 8316 net.cpp:91] Creating Layer fc8 +I0906 
13:34:47.352402 8316 net.cpp:411] fc8 <- fc7 +I0906 13:34:47.352418 8316 net.cpp:369] fc8 -> fc8 +I0906 13:34:47.352433 8316 net.cpp:121] Setting up fc8 +I0906 13:34:47.878074 8316 net.cpp:128] Top shape: 100 1000 (100000) +I0906 13:34:47.878098 8316 net.cpp:134] Memory required for data: 828648000 +I0906 13:34:47.878126 8316 layer_factory.hpp:74] Creating layer loss +I0906 13:34:47.878178 8316 net.cpp:91] Creating Layer loss +I0906 13:34:47.878195 8316 net.cpp:411] loss <- fc8 +I0906 13:34:47.878217 8316 net.cpp:411] loss <- label +I0906 13:34:47.878237 8316 net.cpp:369] loss -> loss +I0906 13:34:47.878255 8316 net.cpp:121] Setting up loss +I0906 13:34:47.878273 8316 layer_factory.hpp:74] Creating layer loss +I0906 13:34:47.878825 8316 net.cpp:128] Top shape: (1) +I0906 13:34:47.878831 8316 net.cpp:130] with loss weight 1 +I0906 13:34:47.878847 8316 net.cpp:134] Memory required for data: 828648004 +I0906 13:34:47.878856 8316 net.cpp:193] loss needs backward computation. +I0906 13:34:47.878865 8316 net.cpp:193] fc8 needs backward computation. +I0906 13:34:47.878870 8316 net.cpp:193] relu7 needs backward computation. +I0906 13:34:47.878876 8316 net.cpp:193] fc7 needs backward computation. +I0906 13:34:47.878882 8316 net.cpp:193] relu6 needs backward computation. +I0906 13:34:47.878888 8316 net.cpp:193] fc6 needs backward computation. +I0906 13:34:47.878895 8316 net.cpp:193] pool5 needs backward computation. +I0906 13:34:47.878901 8316 net.cpp:193] relu5 needs backward computation. +I0906 13:34:47.878906 8316 net.cpp:193] conv5 needs backward computation. +I0906 13:34:47.878911 8316 net.cpp:193] relu4 needs backward computation. +I0906 13:34:47.878917 8316 net.cpp:193] conv4 needs backward computation. +I0906 13:34:47.878923 8316 net.cpp:193] relu3 needs backward computation. +I0906 13:34:47.878928 8316 net.cpp:193] conv3 needs backward computation. +I0906 13:34:47.878936 8316 net.cpp:193] pool2 needs backward computation. 
+I0906 13:34:47.878942 8316 net.cpp:193] norm2 needs backward computation. +I0906 13:34:47.878948 8316 net.cpp:193] relu2 needs backward computation. +I0906 13:34:47.878953 8316 net.cpp:193] conv2 needs backward computation. +I0906 13:34:47.878959 8316 net.cpp:193] pool1 needs backward computation. +I0906 13:34:47.878965 8316 net.cpp:193] norm1 needs backward computation. +I0906 13:34:47.878972 8316 net.cpp:193] relu1 needs backward computation. +I0906 13:34:47.878978 8316 net.cpp:193] conv1 needs backward computation. +I0906 13:34:47.878984 8316 net.cpp:195] data does not need backward computation. +I0906 13:34:47.878993 8316 net.cpp:236] This network produces output loss +I0906 13:34:47.879026 8316 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:34:47.879042 8316 net.cpp:248] Network initialization done. +I0906 13:34:47.879045 8316 net.cpp:249] Memory required for data: 828648004 +I0906 13:34:47.880003 8316 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:34:47.880131 8316 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data +I0906 13:34:47.880362 8316 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TEST +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + 
name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + 
lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:34:47.880718 8316 net.cpp:68] Memory required for data: 0 +I0906 13:34:47.880764 8316 layer_factory.hpp:74] Creating layer data +I0906 13:34:47.880786 8316 net.cpp:91] Creating 
Layer data +I0906 13:34:47.880797 8316 net.cpp:369] data -> data +I0906 13:34:47.880820 8316 net.cpp:369] data -> label +I0906 13:34:47.880832 8316 net.cpp:121] Setting up data +I0906 13:34:47.880839 8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:34:47.890487 8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb +I0906 13:34:47.890738 8316 data_layer.cpp:53] output data size: 50,3,227,227 +I0906 13:34:47.907624 8316 base_data_layer.cpp:43] Initializing prefetch +I0906 13:34:47.907733 8316 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:34:47.907762 8316 net.cpp:128] Top shape: 50 3 227 227 (7729350) +I0906 13:34:47.907769 8316 net.cpp:128] Top shape: 50 (50) +I0906 13:34:47.907773 8316 net.cpp:134] Memory required for data: 30917600 +I0906 13:34:47.907805 8316 layer_factory.hpp:74] Creating layer label_data_1_split +I0906 13:34:47.907896 8316 net.cpp:91] Creating Layer label_data_1_split +I0906 13:34:47.907917 8316 net.cpp:411] label_data_1_split <- label +I0906 13:34:47.907979 8316 net.cpp:369] label_data_1_split -> label_data_1_split_0 +I0906 13:34:47.908016 8316 net.cpp:369] label_data_1_split -> label_data_1_split_1 +I0906 13:34:47.908028 8316 net.cpp:121] Setting up label_data_1_split +I0906 13:34:47.908057 8316 net.cpp:128] Top shape: 50 (50) +I0906 13:34:47.908064 8316 net.cpp:128] Top shape: 50 (50) +I0906 13:34:47.908068 8316 net.cpp:134] Memory required for data: 30918000 +I0906 13:34:47.908073 8316 layer_factory.hpp:74] Creating layer conv1 +I0906 13:34:47.908112 8316 net.cpp:91] Creating Layer conv1 +I0906 13:34:47.908118 8316 net.cpp:411] conv1 <- data +I0906 13:34:47.908133 8316 net.cpp:369] conv1 -> conv1 +I0906 13:34:47.908148 8316 net.cpp:121] Setting up conv1 +I0906 13:34:47.912806 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:47.912811 8316 net.cpp:134] Memory required for data: 88998000 +I0906 13:34:47.912832 8316 
layer_factory.hpp:74] Creating layer relu1 +I0906 13:34:47.912844 8316 net.cpp:91] Creating Layer relu1 +I0906 13:34:47.912850 8316 net.cpp:411] relu1 <- conv1 +I0906 13:34:47.912863 8316 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:34:47.912873 8316 net.cpp:121] Setting up relu1 +I0906 13:34:47.912880 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:47.912883 8316 net.cpp:134] Memory required for data: 147078000 +I0906 13:34:47.912889 8316 layer_factory.hpp:74] Creating layer norm1 +I0906 13:34:47.912907 8316 net.cpp:91] Creating Layer norm1 +I0906 13:34:47.912912 8316 net.cpp:411] norm1 <- conv1 +I0906 13:34:47.912925 8316 net.cpp:369] norm1 -> norm1 +I0906 13:34:47.912936 8316 net.cpp:121] Setting up norm1 +I0906 13:34:47.912955 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:34:47.912999 8316 net.cpp:134] Memory required for data: 205158000 +I0906 13:34:47.913004 8316 layer_factory.hpp:74] Creating layer pool1 +I0906 13:34:47.913022 8316 net.cpp:91] Creating Layer pool1 +I0906 13:34:47.913027 8316 net.cpp:411] pool1 <- norm1 +I0906 13:34:47.913040 8316 net.cpp:369] pool1 -> pool1 +I0906 13:34:47.913050 8316 net.cpp:121] Setting up pool1 +I0906 13:34:47.913069 8316 net.cpp:128] Top shape: 50 96 27 27 (3499200) +I0906 13:34:47.913074 8316 net.cpp:134] Memory required for data: 219154800 +I0906 13:34:47.913079 8316 layer_factory.hpp:74] Creating layer conv2 +I0906 13:34:47.913091 8316 net.cpp:91] Creating Layer conv2 +I0906 13:34:47.913096 8316 net.cpp:411] conv2 <- pool1 +I0906 13:34:47.913111 8316 net.cpp:369] conv2 -> conv2 +I0906 13:34:47.913123 8316 net.cpp:121] Setting up conv2 +I0906 13:34:47.952414 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:47.952428 8316 net.cpp:134] Memory required for data: 256479600 +I0906 13:34:47.952455 8316 layer_factory.hpp:74] Creating layer relu2 +I0906 13:34:47.952477 8316 net.cpp:91] Creating Layer relu2 +I0906 13:34:47.952487 8316 net.cpp:411] relu2 <- conv2 +I0906 
13:34:47.952507 8316 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:34:47.952518 8316 net.cpp:121] Setting up relu2 +I0906 13:34:47.952527 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:47.952532 8316 net.cpp:134] Memory required for data: 293804400 +I0906 13:34:47.952536 8316 layer_factory.hpp:74] Creating layer norm2 +I0906 13:34:47.952558 8316 net.cpp:91] Creating Layer norm2 +I0906 13:34:47.952564 8316 net.cpp:411] norm2 <- conv2 +I0906 13:34:47.952577 8316 net.cpp:369] norm2 -> norm2 +I0906 13:34:47.952591 8316 net.cpp:121] Setting up norm2 +I0906 13:34:47.952610 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:34:47.952615 8316 net.cpp:134] Memory required for data: 331129200 +I0906 13:34:47.952620 8316 layer_factory.hpp:74] Creating layer pool2 +I0906 13:34:47.952635 8316 net.cpp:91] Creating Layer pool2 +I0906 13:34:47.952641 8316 net.cpp:411] pool2 <- norm2 +I0906 13:34:47.952653 8316 net.cpp:369] pool2 -> pool2 +I0906 13:34:47.952663 8316 net.cpp:121] Setting up pool2 +I0906 13:34:47.952682 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:47.952685 8316 net.cpp:134] Memory required for data: 339782000 +I0906 13:34:47.952690 8316 layer_factory.hpp:74] Creating layer conv3 +I0906 13:34:47.952713 8316 net.cpp:91] Creating Layer conv3 +I0906 13:34:47.952718 8316 net.cpp:411] conv3 <- pool2 +I0906 13:34:47.952733 8316 net.cpp:369] conv3 -> conv3 +I0906 13:34:47.952744 8316 net.cpp:121] Setting up conv3 +I0906 13:34:48.002686 8321 data_layer.cpp:120] Prefetch batch: 94 ms. +I0906 13:34:48.002718 8321 data_layer.cpp:121] Read time: 12.003 ms. +I0906 13:34:48.002725 8321 data_layer.cpp:122] Transform time: 81.802 ms. 
+I0906 13:34:48.066742 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:48.066764 8316 net.cpp:134] Memory required for data: 352761200 +I0906 13:34:48.066805 8316 layer_factory.hpp:74] Creating layer relu3 +I0906 13:34:48.066839 8316 net.cpp:91] Creating Layer relu3 +I0906 13:34:48.066854 8316 net.cpp:411] relu3 <- conv3 +I0906 13:34:48.066880 8316 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:34:48.066897 8316 net.cpp:121] Setting up relu3 +I0906 13:34:48.066906 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:48.066910 8316 net.cpp:134] Memory required for data: 365740400 +I0906 13:34:48.066915 8316 layer_factory.hpp:74] Creating layer conv4 +I0906 13:34:48.066942 8316 net.cpp:91] Creating Layer conv4 +I0906 13:34:48.066947 8316 net.cpp:411] conv4 <- conv3 +I0906 13:34:48.066964 8316 net.cpp:369] conv4 -> conv4 +I0906 13:34:48.066979 8316 net.cpp:121] Setting up conv4 +I0906 13:34:48.151291 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:48.151312 8316 net.cpp:134] Memory required for data: 378719600 +I0906 13:34:48.151340 8316 layer_factory.hpp:74] Creating layer relu4 +I0906 13:34:48.151372 8316 net.cpp:91] Creating Layer relu4 +I0906 13:34:48.151430 8316 net.cpp:411] relu4 <- conv4 +I0906 13:34:48.151458 8316 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:34:48.151473 8316 net.cpp:121] Setting up relu4 +I0906 13:34:48.151482 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:34:48.151486 8316 net.cpp:134] Memory required for data: 391698800 +I0906 13:34:48.151491 8316 layer_factory.hpp:74] Creating layer conv5 +I0906 13:34:48.151517 8316 net.cpp:91] Creating Layer conv5 +I0906 13:34:48.151523 8316 net.cpp:411] conv5 <- conv4 +I0906 13:34:48.151540 8316 net.cpp:369] conv5 -> conv5 +I0906 13:34:48.151554 8316 net.cpp:121] Setting up conv5 +I0906 13:34:48.208228 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:48.208250 8316 net.cpp:134] Memory required for data: 400351600 +I0906 
13:34:48.208292 8316 layer_factory.hpp:74] Creating layer relu5 +I0906 13:34:48.208322 8316 net.cpp:91] Creating Layer relu5 +I0906 13:34:48.208336 8316 net.cpp:411] relu5 <- conv5 +I0906 13:34:48.208360 8316 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:34:48.208376 8316 net.cpp:121] Setting up relu5 +I0906 13:34:48.208385 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:34:48.208389 8316 net.cpp:134] Memory required for data: 409004400 +I0906 13:34:48.208395 8316 layer_factory.hpp:74] Creating layer pool5 +I0906 13:34:48.208425 8316 net.cpp:91] Creating Layer pool5 +I0906 13:34:48.208431 8316 net.cpp:411] pool5 <- conv5 +I0906 13:34:48.208446 8316 net.cpp:369] pool5 -> pool5 +I0906 13:34:48.208459 8316 net.cpp:121] Setting up pool5 +I0906 13:34:48.208479 8316 net.cpp:128] Top shape: 50 256 6 6 (460800) +I0906 13:34:48.208483 8316 net.cpp:134] Memory required for data: 410847600 +I0906 13:34:48.208488 8316 layer_factory.hpp:74] Creating layer fc6 +I0906 13:34:48.208510 8316 net.cpp:91] Creating Layer fc6 +I0906 13:34:48.208516 8316 net.cpp:411] fc6 <- pool5 +I0906 13:34:48.208530 8316 net.cpp:369] fc6 -> fc6 +I0906 13:34:48.208544 8316 net.cpp:121] Setting up fc6 +I0906 13:34:52.951850 8316 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:52.951876 8316 net.cpp:134] Memory required for data: 411666800 +I0906 13:34:52.951903 8316 layer_factory.hpp:74] Creating layer relu6 +I0906 13:34:52.951944 8316 net.cpp:91] Creating Layer relu6 +I0906 13:34:52.951961 8316 net.cpp:411] relu6 <- fc6 +I0906 13:34:52.951987 8316 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:34:52.952003 8316 net.cpp:121] Setting up relu6 +I0906 13:34:52.952010 8316 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:52.952014 8316 net.cpp:134] Memory required for data: 412486000 +I0906 13:34:52.952019 8316 layer_factory.hpp:74] Creating layer fc7 +I0906 13:34:52.952044 8316 net.cpp:91] Creating Layer fc7 +I0906 13:34:52.952049 8316 net.cpp:411] fc7 <- fc6 +I0906 
13:34:52.952065 8316 net.cpp:369] fc7 -> fc7 +I0906 13:34:52.952080 8316 net.cpp:121] Setting up fc7 +I0906 13:34:55.059911 8316 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:55.059948 8316 net.cpp:134] Memory required for data: 413305200 +I0906 13:34:55.059976 8316 layer_factory.hpp:74] Creating layer relu7 +I0906 13:34:55.060010 8316 net.cpp:91] Creating Layer relu7 +I0906 13:34:55.060025 8316 net.cpp:411] relu7 <- fc7 +I0906 13:34:55.060053 8316 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:34:55.060070 8316 net.cpp:121] Setting up relu7 +I0906 13:34:55.060078 8316 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:34:55.060082 8316 net.cpp:134] Memory required for data: 414124400 +I0906 13:34:55.060087 8316 layer_factory.hpp:74] Creating layer fc8 +I0906 13:34:55.060109 8316 net.cpp:91] Creating Layer fc8 +I0906 13:34:55.060116 8316 net.cpp:411] fc8 <- fc7 +I0906 13:34:55.060132 8316 net.cpp:369] fc8 -> fc8 +I0906 13:34:55.060156 8316 net.cpp:121] Setting up fc8 +I0906 13:34:55.576926 8316 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:55.576946 8316 net.cpp:134] Memory required for data: 414324400 +I0906 13:34:55.576972 8316 layer_factory.hpp:74] Creating layer fc8_fc8_0_split +I0906 13:34:55.577006 8316 net.cpp:91] Creating Layer fc8_fc8_0_split +I0906 13:34:55.577097 8316 net.cpp:411] fc8_fc8_0_split <- fc8 +I0906 13:34:55.577136 8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 +I0906 13:34:55.577162 8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 +I0906 13:34:55.577173 8316 net.cpp:121] Setting up fc8_fc8_0_split +I0906 13:34:55.577191 8316 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:55.577198 8316 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:34:55.577201 8316 net.cpp:134] Memory required for data: 414724400 +I0906 13:34:55.577206 8316 layer_factory.hpp:74] Creating layer accuracy +I0906 13:34:55.577237 8316 net.cpp:91] Creating Layer accuracy +I0906 13:34:55.577244 8316 net.cpp:411] accuracy <- fc8_fc8_0_split_0 
+I0906 13:34:55.577255 8316 net.cpp:411] accuracy <- label_data_1_split_0 +I0906 13:34:55.577266 8316 net.cpp:369] accuracy -> accuracy +I0906 13:34:55.577277 8316 net.cpp:121] Setting up accuracy +I0906 13:34:55.577293 8316 net.cpp:128] Top shape: (1) +I0906 13:34:55.577297 8316 net.cpp:134] Memory required for data: 414724404 +I0906 13:34:55.577302 8316 layer_factory.hpp:74] Creating layer loss +I0906 13:34:55.577314 8316 net.cpp:91] Creating Layer loss +I0906 13:34:55.577321 8316 net.cpp:411] loss <- fc8_fc8_0_split_1 +I0906 13:34:55.577332 8316 net.cpp:411] loss <- label_data_1_split_1 +I0906 13:34:55.577342 8316 net.cpp:369] loss -> loss +I0906 13:34:55.577353 8316 net.cpp:121] Setting up loss +I0906 13:34:55.577363 8316 layer_factory.hpp:74] Creating layer loss +I0906 13:34:55.577759 8316 net.cpp:128] Top shape: (1) +I0906 13:34:55.577764 8316 net.cpp:130] with loss weight 1 +I0906 13:34:55.577780 8316 net.cpp:134] Memory required for data: 414724408 +I0906 13:34:55.577786 8316 net.cpp:193] loss needs backward computation. +I0906 13:34:55.577795 8316 net.cpp:195] accuracy does not need backward computation. +I0906 13:34:55.577801 8316 net.cpp:193] fc8_fc8_0_split needs backward computation. +I0906 13:34:55.577807 8316 net.cpp:193] fc8 needs backward computation. +I0906 13:34:55.577813 8316 net.cpp:193] relu7 needs backward computation. +I0906 13:34:55.577818 8316 net.cpp:193] fc7 needs backward computation. +I0906 13:34:55.577824 8316 net.cpp:193] relu6 needs backward computation. +I0906 13:34:55.577831 8316 net.cpp:193] fc6 needs backward computation. +I0906 13:34:55.577836 8316 net.cpp:193] pool5 needs backward computation. +I0906 13:34:55.577842 8316 net.cpp:193] relu5 needs backward computation. +I0906 13:34:55.577847 8316 net.cpp:193] conv5 needs backward computation. +I0906 13:34:55.577853 8316 net.cpp:193] relu4 needs backward computation. +I0906 13:34:55.577859 8316 net.cpp:193] conv4 needs backward computation. 
+I0906 13:34:55.577864 8316 net.cpp:193] relu3 needs backward computation. +I0906 13:34:55.577870 8316 net.cpp:193] conv3 needs backward computation. +I0906 13:34:55.577877 8316 net.cpp:193] pool2 needs backward computation. +I0906 13:34:55.577883 8316 net.cpp:193] norm2 needs backward computation. +I0906 13:34:55.577888 8316 net.cpp:193] relu2 needs backward computation. +I0906 13:34:55.577893 8316 net.cpp:193] conv2 needs backward computation. +I0906 13:34:55.577899 8316 net.cpp:193] pool1 needs backward computation. +I0906 13:34:55.577905 8316 net.cpp:193] norm1 needs backward computation. +I0906 13:34:55.577911 8316 net.cpp:193] relu1 needs backward computation. +I0906 13:34:55.577916 8316 net.cpp:193] conv1 needs backward computation. +I0906 13:34:55.577924 8316 net.cpp:195] label_data_1_split does not need backward computation. +I0906 13:34:55.577931 8316 net.cpp:195] data does not need backward computation. +I0906 13:34:55.577936 8316 net.cpp:236] This network produces output accuracy +I0906 13:34:55.577942 8316 net.cpp:236] This network produces output loss +I0906 13:34:55.577977 8316 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:34:55.577991 8316 net.cpp:248] Network initialization done. +I0906 13:34:55.577996 8316 net.cpp:249] Memory required for data: 414724408 +I0906 13:34:55.578182 8316 solver.cpp:53] Solver scaffolding done. 
+I0906 13:34:55.578306 8316 solver.cpp:270] Solving AlexNet +I0906 13:34:55.578330 8316 solver.cpp:271] Learning Rate Policy: step +I0906 13:34:55.580096 8316 solver.cpp:314] Iteration 0, Testing net (#0) +I0906 13:34:55.580111 8316 net.cpp:696] Copying source layer data +I0906 13:34:55.580116 8316 net.cpp:696] Copying source layer conv1 +I0906 13:34:55.583168 8316 net.cpp:696] Copying source layer relu1 +I0906 13:34:55.583199 8316 net.cpp:696] Copying source layer norm1 +I0906 13:34:55.583204 8316 net.cpp:696] Copying source layer pool1 +I0906 13:34:55.583209 8316 net.cpp:696] Copying source layer conv2 +I0906 13:34:55.583320 8316 net.cpp:696] Copying source layer relu2 +I0906 13:34:55.583326 8316 net.cpp:696] Copying source layer norm2 +I0906 13:34:55.583331 8316 net.cpp:696] Copying source layer pool2 +I0906 13:34:55.583335 8316 net.cpp:696] Copying source layer conv3 +I0906 13:34:55.583690 8316 net.cpp:696] Copying source layer relu3 +I0906 13:34:55.583698 8316 net.cpp:696] Copying source layer conv4 +I0906 13:34:55.583895 8316 net.cpp:696] Copying source layer relu4 +I0906 13:34:55.583902 8316 net.cpp:696] Copying source layer conv5 +I0906 13:34:55.584177 8316 net.cpp:696] Copying source layer relu5 +I0906 13:34:55.584185 8316 net.cpp:696] Copying source layer pool5 +I0906 13:34:55.584189 8316 net.cpp:696] Copying source layer fc6 +I0906 13:34:55.589432 8316 net.cpp:696] Copying source layer relu6 +I0906 13:34:55.589460 8316 net.cpp:696] Copying source layer fc7 +I0906 13:34:55.592273 8316 net.cpp:696] Copying source layer relu7 +I0906 13:34:55.592288 8316 net.cpp:696] Copying source layer fc8 +I0906 13:34:55.593138 8316 net.cpp:696] Copying source layer loss +I0906 13:34:55.593260 8316 base_data_layer.cpp:89] Thread joined +I0906 13:34:55.597589 8316 base_data_layer.cpp:93] Prefetch copied +I0906 13:34:55.597887 8316 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:34:55.695569 8322 data_layer.cpp:120] Prefetch batch: 97 ms. 
+I0906 13:34:55.695600 8322 data_layer.cpp:121] Read time: 13.209 ms. +I0906 13:34:55.695606 8322 data_layer.cpp:122] Transform time: 83.025 ms. +I0906 13:34:58.623245 8316 solver.cpp:363] Test net output #0: accuracy = 0 +I0906 13:34:58.623273 8316 solver.cpp:363] Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss) +I0906 13:34:58.623322 8316 base_data_layer.cpp:89] Thread joined +I0906 13:34:58.632244 8316 base_data_layer.cpp:93] Prefetch copied +I0906 13:34:58.632606 8316 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:34:58.819707 8323 data_layer.cpp:120] Prefetch batch: 186 ms. +I0906 13:34:58.819741 8323 data_layer.cpp:121] Read time: 24.148 ms. +I0906 13:34:58.819747 8323 data_layer.cpp:122] Transform time: 161.152 ms. +I0906 13:35:05.407784 8316 solver.cpp:234] Iteration 0, loss = 0 +I0906 13:35:05.407842 8316 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) +I0906 13:35:05.407891 8316 solver.cpp:506] Iteration 0, lr = 0.01 +I0906 13:35:05.525874 8316 base_data_layer.cpp:89] Thread joined +I0906 13:35:05.533869 8316 base_data_layer.cpp:93] Prefetch copied +I0906 13:35:05.534140 8316 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:35:05.722632 8328 data_layer.cpp:120] Prefetch batch: 188 ms. +I0906 13:35:05.722664 8328 data_layer.cpp:121] Read time: 24.184 ms. +I0906 13:35:05.722672 8328 data_layer.cpp:122] Transform time: 162.257 ms. 
+I0906 13:35:08.300590 8316 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 new file mode 100644 index 00000000..6ec81c82 --- /dev/null +++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 @@ -0,0 +1,1160 @@ +Log file created at: 2015/09/06 13:58:05 +Running on machine: AMD-RESEARCH +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0906 13:58:05.835170 16515 caffe.cpp:114] Use GPU with device ID 0 +I0906 13:58:05.875704 16515 device.cpp:230] Number of platforms found:1 +I0906 13:58:05.875743 16515 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing +I0906 13:58:05.875757 16515 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE +I0906 13:58:05.875763 16515 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) +I0906 13:58:05.875769 16515 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. 
+I0906 13:58:05.875774 16515 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices +I0906 13:58:05.875783 16515 device.cpp:286] Number of devices found:1 +I0906 13:58:05.875788 16515 device.cpp:288] DeviceID: 0x18ab2f0 +I0906 13:58:05.875809 16515 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU +I0906 13:58:05.875818 16515 device.cpp:393] Is it integrated GPU?: 0 +I0906 13:58:05.875823 16515 device.cpp:393] Max clock frequency MHz: 930 +I0906 13:58:05.875829 16515 device.cpp:393] Host-Device unified mem: 0 +I0906 13:58:05.875834 16515 device.cpp:393] ECC support: 0 +I0906 13:58:05.875839 16515 device.cpp:393] Endian little: 1 +I0906 13:58:05.875844 16515 device.cpp:393] Max compute units: 44 +I0906 13:58:05.875849 16515 device.cpp:393] Max work group size: 256 +I0906 13:58:05.875856 16515 device.cpp:393] Max work item dimensions: 3 +I0906 13:58:05.875862 16515 device.cpp:393] Max work item sizes: 0x100 +I0906 13:58:05.875869 16515 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE +I0906 13:58:05.875875 16515 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL +I0906 13:58:05.875881 16515 device.cpp:393] Max mem alloc size: 4244635648 +I0906 13:58:05.875886 16515 device.cpp:393] Global mem size: 16878927872 +I0906 13:58:05.875891 16515 device.cpp:393] Local mem size: 32768 +I0906 13:58:05.875902 16515 device.cpp:96] Picked device type : GPU 0 +I0906 13:58:08.267483 16515 device.cpp:152] Build Program +I0906 13:58:08.267706 16515 caffe.cpp:122] Starting Optimization +I0906 13:58:08.267797 16515 solver.cpp:40] Initializing solver from parameters: +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +display: 1 +max_iter: 10 +lr_policy: "step" +gamma: 0.1 +momentum: 0.9 +weight_decay: 0.0005 +stepsize: 100000 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +I0906 
13:58:08.267910 16515 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:58:08.269042 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data +I0906 13:58:08.269093 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy +I0906 13:58:08.269273 16515 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TRAIN +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 100 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: 
"norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + 
} +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:58:08.269708 16515 net.cpp:68] Memory required for data: 0 +I0906 13:58:08.269917 16515 layer_factory.hpp:74] Creating layer data +I0906 13:58:08.269971 16515 net.cpp:91] Creating Layer data +I0906 13:58:08.269992 16515 net.cpp:369] data -> data +I0906 13:58:08.270097 16515 net.cpp:369] data -> label +I0906 13:58:08.270122 16515 net.cpp:121] Setting up data +I0906 13:58:08.270134 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:58:08.279337 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb +I0906 13:58:08.279680 16515 data_layer.cpp:53] output data size: 100,3,227,227 +I0906 13:58:08.311036 16515 base_data_layer.cpp:43] Initializing prefetch +I0906 13:58:08.311240 16515 base_data_layer.cpp:45] Prefetch initialized. 
+I0906 13:58:08.311303 16515 net.cpp:128] Top shape: 100 3 227 227 (15458700) +I0906 13:58:08.311313 16515 net.cpp:128] Top shape: 100 (100) +I0906 13:58:08.311318 16515 net.cpp:134] Memory required for data: 61835200 +I0906 13:58:08.311352 16515 layer_factory.hpp:74] Creating layer conv1 +I0906 13:58:08.311431 16515 net.cpp:91] Creating Layer conv1 +I0906 13:58:08.311453 16515 net.cpp:411] conv1 <- data +I0906 13:58:08.311504 16515 net.cpp:369] conv1 -> conv1 +I0906 13:58:08.311569 16515 net.cpp:121] Setting up conv1 +I0906 13:58:08.316509 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:08.316515 16515 net.cpp:134] Memory required for data: 177995200 +I0906 13:58:08.316555 16515 layer_factory.hpp:74] Creating layer relu1 +I0906 13:58:08.316577 16515 net.cpp:91] Creating Layer relu1 +I0906 13:58:08.316583 16515 net.cpp:411] relu1 <- conv1 +I0906 13:58:08.316597 16515 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:58:08.316606 16515 net.cpp:121] Setting up relu1 +I0906 13:58:08.316615 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:08.316619 16515 net.cpp:134] Memory required for data: 294155200 +I0906 13:58:08.316623 16515 layer_factory.hpp:74] Creating layer norm1 +I0906 13:58:08.316653 16515 net.cpp:91] Creating Layer norm1 +I0906 13:58:08.316659 16515 net.cpp:411] norm1 <- conv1 +I0906 13:58:08.316673 16515 net.cpp:369] norm1 -> norm1 +I0906 13:58:08.316686 16515 net.cpp:121] Setting up norm1 +I0906 13:58:08.316710 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:08.316715 16515 net.cpp:134] Memory required for data: 410315200 +I0906 13:58:08.316720 16515 layer_factory.hpp:74] Creating layer pool1 +I0906 13:58:08.316745 16515 net.cpp:91] Creating Layer pool1 +I0906 13:58:08.316750 16515 net.cpp:411] pool1 <- norm1 +I0906 13:58:08.316763 16515 net.cpp:369] pool1 -> pool1 +I0906 13:58:08.316776 16515 net.cpp:121] Setting up pool1 +I0906 13:58:08.316805 16515 net.cpp:128] Top shape: 100 96 27 27 (6998400) 
+I0906 13:58:08.316809 16515 net.cpp:134] Memory required for data: 438308800 +I0906 13:58:08.316814 16515 layer_factory.hpp:74] Creating layer conv2 +I0906 13:58:08.316829 16515 net.cpp:91] Creating Layer conv2 +I0906 13:58:08.316834 16515 net.cpp:411] conv2 <- pool1 +I0906 13:58:08.316850 16515 net.cpp:369] conv2 -> conv2 +I0906 13:58:08.316862 16515 net.cpp:121] Setting up conv2 +I0906 13:58:08.356899 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:08.356914 16515 net.cpp:134] Memory required for data: 512958400 +I0906 13:58:08.356945 16515 layer_factory.hpp:74] Creating layer relu2 +I0906 13:58:08.356967 16515 net.cpp:91] Creating Layer relu2 +I0906 13:58:08.356978 16515 net.cpp:411] relu2 <- conv2 +I0906 13:58:08.356998 16515 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:58:08.357012 16515 net.cpp:121] Setting up relu2 +I0906 13:58:08.357022 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:08.357025 16515 net.cpp:134] Memory required for data: 587608000 +I0906 13:58:08.357030 16515 layer_factory.hpp:74] Creating layer norm2 +I0906 13:58:08.357046 16515 net.cpp:91] Creating Layer norm2 +I0906 13:58:08.357053 16515 net.cpp:411] norm2 <- conv2 +I0906 13:58:08.357066 16515 net.cpp:369] norm2 -> norm2 +I0906 13:58:08.357079 16515 net.cpp:121] Setting up norm2 +I0906 13:58:08.357108 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:08.357113 16515 net.cpp:134] Memory required for data: 662257600 +I0906 13:58:08.357118 16515 layer_factory.hpp:74] Creating layer pool2 +I0906 13:58:08.357146 16515 net.cpp:91] Creating Layer pool2 +I0906 13:58:08.357152 16515 net.cpp:411] pool2 <- norm2 +I0906 13:58:08.357166 16515 net.cpp:369] pool2 -> pool2 +I0906 13:58:08.357177 16515 net.cpp:121] Setting up pool2 +I0906 13:58:08.357200 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:08.357204 16515 net.cpp:134] Memory required for data: 679563200 +I0906 13:58:08.357259 16515 layer_factory.hpp:74] Creating 
layer conv3 +I0906 13:58:08.357281 16515 net.cpp:91] Creating Layer conv3 +I0906 13:58:08.357287 16515 net.cpp:411] conv3 <- pool2 +I0906 13:58:08.357303 16515 net.cpp:369] conv3 -> conv3 +I0906 13:58:08.357318 16515 net.cpp:121] Setting up conv3 +I0906 13:58:08.475977 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:08.475999 16515 net.cpp:134] Memory required for data: 705521600 +I0906 13:58:08.476043 16515 layer_factory.hpp:74] Creating layer relu3 +I0906 13:58:08.476078 16515 net.cpp:91] Creating Layer relu3 +I0906 13:58:08.476093 16515 net.cpp:411] relu3 <- conv3 +I0906 13:58:08.476120 16515 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:58:08.476137 16515 net.cpp:121] Setting up relu3 +I0906 13:58:08.476147 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:08.476151 16515 net.cpp:134] Memory required for data: 731480000 +I0906 13:58:08.476156 16515 layer_factory.hpp:74] Creating layer conv4 +I0906 13:58:08.476184 16515 net.cpp:91] Creating Layer conv4 +I0906 13:58:08.476191 16515 net.cpp:411] conv4 <- conv3 +I0906 13:58:08.476207 16515 net.cpp:369] conv4 -> conv4 +I0906 13:58:08.476222 16515 net.cpp:121] Setting up conv4 +I0906 13:58:08.500998 16519 data_layer.cpp:120] Prefetch batch: 189 ms. +I0906 13:58:08.501045 16519 data_layer.cpp:121] Read time: 23.893 ms. +I0906 13:58:08.501054 16519 data_layer.cpp:122] Transform time: 163.51 ms. 
+I0906 13:58:08.563753 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:08.563774 16515 net.cpp:134] Memory required for data: 757438400 +I0906 13:58:08.563802 16515 layer_factory.hpp:74] Creating layer relu4 +I0906 13:58:08.563835 16515 net.cpp:91] Creating Layer relu4 +I0906 13:58:08.563849 16515 net.cpp:411] relu4 <- conv4 +I0906 13:58:08.563876 16515 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:58:08.563892 16515 net.cpp:121] Setting up relu4 +I0906 13:58:08.563902 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:08.563906 16515 net.cpp:134] Memory required for data: 783396800 +I0906 13:58:08.563911 16515 layer_factory.hpp:74] Creating layer conv5 +I0906 13:58:08.563946 16515 net.cpp:91] Creating Layer conv5 +I0906 13:58:08.563951 16515 net.cpp:411] conv5 <- conv4 +I0906 13:58:08.563968 16515 net.cpp:369] conv5 -> conv5 +I0906 13:58:08.563982 16515 net.cpp:121] Setting up conv5 +I0906 13:58:08.621495 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:08.621512 16515 net.cpp:134] Memory required for data: 800702400 +I0906 13:58:08.621553 16515 layer_factory.hpp:74] Creating layer relu5 +I0906 13:58:08.621584 16515 net.cpp:91] Creating Layer relu5 +I0906 13:58:08.621598 16515 net.cpp:411] relu5 <- conv5 +I0906 13:58:08.621623 16515 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:58:08.621639 16515 net.cpp:121] Setting up relu5 +I0906 13:58:08.621649 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:08.621652 16515 net.cpp:134] Memory required for data: 818008000 +I0906 13:58:08.621657 16515 layer_factory.hpp:74] Creating layer pool5 +I0906 13:58:08.621677 16515 net.cpp:91] Creating Layer pool5 +I0906 13:58:08.621683 16515 net.cpp:411] pool5 <- conv5 +I0906 13:58:08.621697 16515 net.cpp:369] pool5 -> pool5 +I0906 13:58:08.621711 16515 net.cpp:121] Setting up pool5 +I0906 13:58:08.621732 16515 net.cpp:128] Top shape: 100 256 6 6 (921600) +I0906 13:58:08.621737 16515 net.cpp:134] Memory 
required for data: 821694400 +I0906 13:58:08.621742 16515 layer_factory.hpp:74] Creating layer fc6 +I0906 13:58:08.621778 16515 net.cpp:91] Creating Layer fc6 +I0906 13:58:08.621783 16515 net.cpp:411] fc6 <- pool5 +I0906 13:58:08.621798 16515 net.cpp:369] fc6 -> fc6 +I0906 13:58:08.621812 16515 net.cpp:121] Setting up fc6 +I0906 13:58:13.492439 16515 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:58:13.492465 16515 net.cpp:134] Memory required for data: 823332800 +I0906 13:58:13.492493 16515 layer_factory.hpp:74] Creating layer relu6 +I0906 13:58:13.492527 16515 net.cpp:91] Creating Layer relu6 +I0906 13:58:13.492542 16515 net.cpp:411] relu6 <- fc6 +I0906 13:58:13.492568 16515 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:58:13.492630 16515 net.cpp:121] Setting up relu6 +I0906 13:58:13.492640 16515 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:58:13.492643 16515 net.cpp:134] Memory required for data: 824971200 +I0906 13:58:13.492648 16515 layer_factory.hpp:74] Creating layer fc7 +I0906 13:58:13.492671 16515 net.cpp:91] Creating Layer fc7 +I0906 13:58:13.492677 16515 net.cpp:411] fc7 <- fc6 +I0906 13:58:13.492693 16515 net.cpp:369] fc7 -> fc7 +I0906 13:58:13.492708 16515 net.cpp:121] Setting up fc7 +I0906 13:58:15.661120 16515 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:58:15.661144 16515 net.cpp:134] Memory required for data: 826609600 +I0906 13:58:15.661171 16515 layer_factory.hpp:74] Creating layer relu7 +I0906 13:58:15.661205 16515 net.cpp:91] Creating Layer relu7 +I0906 13:58:15.661221 16515 net.cpp:411] relu7 <- fc7 +I0906 13:58:15.661247 16515 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:58:15.661263 16515 net.cpp:121] Setting up relu7 +I0906 13:58:15.661273 16515 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:58:15.661276 16515 net.cpp:134] Memory required for data: 828248000 +I0906 13:58:15.661281 16515 layer_factory.hpp:74] Creating layer fc8 +I0906 13:58:15.661304 16515 net.cpp:91] Creating Layer fc8 +I0906 13:58:15.661310 16515 
net.cpp:411] fc8 <- fc7 +I0906 13:58:15.661325 16515 net.cpp:369] fc8 -> fc8 +I0906 13:58:15.661340 16515 net.cpp:121] Setting up fc8 +I0906 13:58:16.190832 16515 net.cpp:128] Top shape: 100 1000 (100000) +I0906 13:58:16.190855 16515 net.cpp:134] Memory required for data: 828648000 +I0906 13:58:16.190881 16515 layer_factory.hpp:74] Creating layer loss +I0906 13:58:16.190932 16515 net.cpp:91] Creating Layer loss +I0906 13:58:16.190946 16515 net.cpp:411] loss <- fc8 +I0906 13:58:16.190969 16515 net.cpp:411] loss <- label +I0906 13:58:16.190989 16515 net.cpp:369] loss -> loss +I0906 13:58:16.191009 16515 net.cpp:121] Setting up loss +I0906 13:58:16.191030 16515 layer_factory.hpp:74] Creating layer loss +I0906 13:58:16.191588 16515 net.cpp:128] Top shape: (1) +I0906 13:58:16.191593 16515 net.cpp:130] with loss weight 1 +I0906 13:58:16.191611 16515 net.cpp:134] Memory required for data: 828648004 +I0906 13:58:16.191619 16515 net.cpp:193] loss needs backward computation. +I0906 13:58:16.191627 16515 net.cpp:193] fc8 needs backward computation. +I0906 13:58:16.191633 16515 net.cpp:193] relu7 needs backward computation. +I0906 13:58:16.191639 16515 net.cpp:193] fc7 needs backward computation. +I0906 13:58:16.191644 16515 net.cpp:193] relu6 needs backward computation. +I0906 13:58:16.191650 16515 net.cpp:193] fc6 needs backward computation. +I0906 13:58:16.191655 16515 net.cpp:193] pool5 needs backward computation. +I0906 13:58:16.191661 16515 net.cpp:193] relu5 needs backward computation. +I0906 13:58:16.191666 16515 net.cpp:193] conv5 needs backward computation. +I0906 13:58:16.191673 16515 net.cpp:193] relu4 needs backward computation. +I0906 13:58:16.191678 16515 net.cpp:193] conv4 needs backward computation. +I0906 13:58:16.191684 16515 net.cpp:193] relu3 needs backward computation. +I0906 13:58:16.191689 16515 net.cpp:193] conv3 needs backward computation. +I0906 13:58:16.191696 16515 net.cpp:193] pool2 needs backward computation. 
+I0906 13:58:16.191702 16515 net.cpp:193] norm2 needs backward computation. +I0906 13:58:16.191709 16515 net.cpp:193] relu2 needs backward computation. +I0906 13:58:16.191714 16515 net.cpp:193] conv2 needs backward computation. +I0906 13:58:16.191720 16515 net.cpp:193] pool1 needs backward computation. +I0906 13:58:16.191725 16515 net.cpp:193] norm1 needs backward computation. +I0906 13:58:16.191731 16515 net.cpp:193] relu1 needs backward computation. +I0906 13:58:16.191737 16515 net.cpp:193] conv1 needs backward computation. +I0906 13:58:16.191745 16515 net.cpp:195] data does not need backward computation. +I0906 13:58:16.191753 16515 net.cpp:236] This network produces output loss +I0906 13:58:16.191787 16515 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:58:16.191803 16515 net.cpp:248] Network initialization done. +I0906 13:58:16.191807 16515 net.cpp:249] Memory required for data: 828648004 +I0906 13:58:16.192769 16515 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:58:16.192881 16515 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data +I0906 13:58:16.193114 16515 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TEST +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } 
+} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" 
+ param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:58:16.193480 16515 net.cpp:68] Memory required for data: 0 +I0906 13:58:16.193527 16515 layer_factory.hpp:74] Creating layer data +I0906 13:58:16.193549 16515 
net.cpp:91] Creating Layer data +I0906 13:58:16.193559 16515 net.cpp:369] data -> data +I0906 13:58:16.193583 16515 net.cpp:369] data -> label +I0906 13:58:16.193595 16515 net.cpp:121] Setting up data +I0906 13:58:16.193603 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:58:16.202100 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb +I0906 13:58:16.202343 16515 data_layer.cpp:53] output data size: 50,3,227,227 +I0906 13:58:16.219017 16515 base_data_layer.cpp:43] Initializing prefetch +I0906 13:58:16.219137 16515 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:58:16.219171 16515 net.cpp:128] Top shape: 50 3 227 227 (7729350) +I0906 13:58:16.219179 16515 net.cpp:128] Top shape: 50 (50) +I0906 13:58:16.219183 16515 net.cpp:134] Memory required for data: 30917600 +I0906 13:58:16.219214 16515 layer_factory.hpp:74] Creating layer label_data_1_split +I0906 13:58:16.219279 16515 net.cpp:91] Creating Layer label_data_1_split +I0906 13:58:16.219293 16515 net.cpp:411] label_data_1_split <- label +I0906 13:58:16.219367 16515 net.cpp:369] label_data_1_split -> label_data_1_split_0 +I0906 13:58:16.219409 16515 net.cpp:369] label_data_1_split -> label_data_1_split_1 +I0906 13:58:16.219420 16515 net.cpp:121] Setting up label_data_1_split +I0906 13:58:16.219455 16515 net.cpp:128] Top shape: 50 (50) +I0906 13:58:16.219462 16515 net.cpp:128] Top shape: 50 (50) +I0906 13:58:16.219466 16515 net.cpp:134] Memory required for data: 30918000 +I0906 13:58:16.219471 16515 layer_factory.hpp:74] Creating layer conv1 +I0906 13:58:16.219508 16515 net.cpp:91] Creating Layer conv1 +I0906 13:58:16.219513 16515 net.cpp:411] conv1 <- data +I0906 13:58:16.219530 16515 net.cpp:369] conv1 -> conv1 +I0906 13:58:16.219545 16515 net.cpp:121] Setting up conv1 +I0906 13:58:16.224315 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:58:16.224321 16515 net.cpp:134] Memory required for 
data: 88998000 +I0906 13:58:16.224341 16515 layer_factory.hpp:74] Creating layer relu1 +I0906 13:58:16.224354 16515 net.cpp:91] Creating Layer relu1 +I0906 13:58:16.224360 16515 net.cpp:411] relu1 <- conv1 +I0906 13:58:16.224372 16515 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:58:16.224382 16515 net.cpp:121] Setting up relu1 +I0906 13:58:16.224390 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:58:16.224393 16515 net.cpp:134] Memory required for data: 147078000 +I0906 13:58:16.224398 16515 layer_factory.hpp:74] Creating layer norm1 +I0906 13:58:16.224417 16515 net.cpp:91] Creating Layer norm1 +I0906 13:58:16.224423 16515 net.cpp:411] norm1 <- conv1 +I0906 13:58:16.224436 16515 net.cpp:369] norm1 -> norm1 +I0906 13:58:16.224447 16515 net.cpp:121] Setting up norm1 +I0906 13:58:16.224465 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:58:16.224508 16515 net.cpp:134] Memory required for data: 205158000 +I0906 13:58:16.224514 16515 layer_factory.hpp:74] Creating layer pool1 +I0906 13:58:16.224529 16515 net.cpp:91] Creating Layer pool1 +I0906 13:58:16.224534 16515 net.cpp:411] pool1 <- norm1 +I0906 13:58:16.224547 16515 net.cpp:369] pool1 -> pool1 +I0906 13:58:16.224558 16515 net.cpp:121] Setting up pool1 +I0906 13:58:16.224576 16515 net.cpp:128] Top shape: 50 96 27 27 (3499200) +I0906 13:58:16.224581 16515 net.cpp:134] Memory required for data: 219154800 +I0906 13:58:16.224586 16515 layer_factory.hpp:74] Creating layer conv2 +I0906 13:58:16.224601 16515 net.cpp:91] Creating Layer conv2 +I0906 13:58:16.224606 16515 net.cpp:411] conv2 <- pool1 +I0906 13:58:16.224620 16515 net.cpp:369] conv2 -> conv2 +I0906 13:58:16.224632 16515 net.cpp:121] Setting up conv2 +I0906 13:58:16.264878 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:58:16.264889 16515 net.cpp:134] Memory required for data: 256479600 +I0906 13:58:16.264916 16515 layer_factory.hpp:74] Creating layer relu2 +I0906 13:58:16.264937 16515 net.cpp:91] Creating Layer 
relu2 +I0906 13:58:16.264946 16515 net.cpp:411] relu2 <- conv2 +I0906 13:58:16.264966 16515 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:58:16.264978 16515 net.cpp:121] Setting up relu2 +I0906 13:58:16.264987 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:58:16.264991 16515 net.cpp:134] Memory required for data: 293804400 +I0906 13:58:16.264997 16515 layer_factory.hpp:74] Creating layer norm2 +I0906 13:58:16.265015 16515 net.cpp:91] Creating Layer norm2 +I0906 13:58:16.265022 16515 net.cpp:411] norm2 <- conv2 +I0906 13:58:16.265035 16515 net.cpp:369] norm2 -> norm2 +I0906 13:58:16.265050 16515 net.cpp:121] Setting up norm2 +I0906 13:58:16.265072 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:58:16.265077 16515 net.cpp:134] Memory required for data: 331129200 +I0906 13:58:16.265082 16515 layer_factory.hpp:74] Creating layer pool2 +I0906 13:58:16.265097 16515 net.cpp:91] Creating Layer pool2 +I0906 13:58:16.265103 16515 net.cpp:411] pool2 <- norm2 +I0906 13:58:16.265116 16515 net.cpp:369] pool2 -> pool2 +I0906 13:58:16.265127 16515 net.cpp:121] Setting up pool2 +I0906 13:58:16.265149 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:58:16.265153 16515 net.cpp:134] Memory required for data: 339782000 +I0906 13:58:16.265158 16515 layer_factory.hpp:74] Creating layer conv3 +I0906 13:58:16.265179 16515 net.cpp:91] Creating Layer conv3 +I0906 13:58:16.265184 16515 net.cpp:411] conv3 <- pool2 +I0906 13:58:16.265200 16515 net.cpp:369] conv3 -> conv3 +I0906 13:58:16.265213 16515 net.cpp:121] Setting up conv3 +I0906 13:58:16.312928 16520 data_layer.cpp:120] Prefetch batch: 93 ms. +I0906 13:58:16.312959 16520 data_layer.cpp:121] Read time: 12.075 ms. +I0906 13:58:16.312966 16520 data_layer.cpp:122] Transform time: 80.513 ms. 
+I0906 13:58:16.381564 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:58:16.381587 16515 net.cpp:134] Memory required for data: 352761200 +I0906 13:58:16.381628 16515 layer_factory.hpp:74] Creating layer relu3 +I0906 13:58:16.381660 16515 net.cpp:91] Creating Layer relu3 +I0906 13:58:16.381675 16515 net.cpp:411] relu3 <- conv3 +I0906 13:58:16.381700 16515 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:58:16.381717 16515 net.cpp:121] Setting up relu3 +I0906 13:58:16.381726 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:58:16.381731 16515 net.cpp:134] Memory required for data: 365740400 +I0906 13:58:16.381734 16515 layer_factory.hpp:74] Creating layer conv4 +I0906 13:58:16.381762 16515 net.cpp:91] Creating Layer conv4 +I0906 13:58:16.381767 16515 net.cpp:411] conv4 <- conv3 +I0906 13:58:16.381783 16515 net.cpp:369] conv4 -> conv4 +I0906 13:58:16.381798 16515 net.cpp:121] Setting up conv4 +I0906 13:58:16.468471 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:58:16.468492 16515 net.cpp:134] Memory required for data: 378719600 +I0906 13:58:16.468518 16515 layer_factory.hpp:74] Creating layer relu4 +I0906 13:58:16.468550 16515 net.cpp:91] Creating Layer relu4 +I0906 13:58:16.468605 16515 net.cpp:411] relu4 <- conv4 +I0906 13:58:16.468633 16515 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:58:16.468649 16515 net.cpp:121] Setting up relu4 +I0906 13:58:16.468658 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:58:16.468662 16515 net.cpp:134] Memory required for data: 391698800 +I0906 13:58:16.468667 16515 layer_factory.hpp:74] Creating layer conv5 +I0906 13:58:16.468694 16515 net.cpp:91] Creating Layer conv5 +I0906 13:58:16.468700 16515 net.cpp:411] conv5 <- conv4 +I0906 13:58:16.468716 16515 net.cpp:369] conv5 -> conv5 +I0906 13:58:16.468731 16515 net.cpp:121] Setting up conv5 +I0906 13:58:16.526487 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:58:16.526507 16515 net.cpp:134] Memory required 
for data: 400351600 +I0906 13:58:16.526547 16515 layer_factory.hpp:74] Creating layer relu5 +I0906 13:58:16.526577 16515 net.cpp:91] Creating Layer relu5 +I0906 13:58:16.526590 16515 net.cpp:411] relu5 <- conv5 +I0906 13:58:16.526614 16515 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:58:16.526630 16515 net.cpp:121] Setting up relu5 +I0906 13:58:16.526639 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:58:16.526643 16515 net.cpp:134] Memory required for data: 409004400 +I0906 13:58:16.526648 16515 layer_factory.hpp:74] Creating layer pool5 +I0906 13:58:16.526676 16515 net.cpp:91] Creating Layer pool5 +I0906 13:58:16.526682 16515 net.cpp:411] pool5 <- conv5 +I0906 13:58:16.526696 16515 net.cpp:369] pool5 -> pool5 +I0906 13:58:16.526710 16515 net.cpp:121] Setting up pool5 +I0906 13:58:16.526731 16515 net.cpp:128] Top shape: 50 256 6 6 (460800) +I0906 13:58:16.526734 16515 net.cpp:134] Memory required for data: 410847600 +I0906 13:58:16.526739 16515 layer_factory.hpp:74] Creating layer fc6 +I0906 13:58:16.526762 16515 net.cpp:91] Creating Layer fc6 +I0906 13:58:16.526767 16515 net.cpp:411] fc6 <- pool5 +I0906 13:58:16.526782 16515 net.cpp:369] fc6 -> fc6 +I0906 13:58:16.526794 16515 net.cpp:121] Setting up fc6 +I0906 13:58:21.365124 16515 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:58:21.365149 16515 net.cpp:134] Memory required for data: 411666800 +I0906 13:58:21.365176 16515 layer_factory.hpp:74] Creating layer relu6 +I0906 13:58:21.365211 16515 net.cpp:91] Creating Layer relu6 +I0906 13:58:21.365226 16515 net.cpp:411] relu6 <- fc6 +I0906 13:58:21.365250 16515 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:58:21.365267 16515 net.cpp:121] Setting up relu6 +I0906 13:58:21.365277 16515 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:58:21.365280 16515 net.cpp:134] Memory required for data: 412486000 +I0906 13:58:21.365285 16515 layer_factory.hpp:74] Creating layer fc7 +I0906 13:58:21.365309 16515 net.cpp:91] Creating Layer fc7 +I0906 
13:58:21.365314 16515 net.cpp:411] fc7 <- fc6 +I0906 13:58:21.365330 16515 net.cpp:369] fc7 -> fc7 +I0906 13:58:21.365345 16515 net.cpp:121] Setting up fc7 +I0906 13:58:23.510701 16515 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:58:23.510725 16515 net.cpp:134] Memory required for data: 413305200 +I0906 13:58:23.510752 16515 layer_factory.hpp:74] Creating layer relu7 +I0906 13:58:23.510785 16515 net.cpp:91] Creating Layer relu7 +I0906 13:58:23.510800 16515 net.cpp:411] relu7 <- fc7 +I0906 13:58:23.510828 16515 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:58:23.510844 16515 net.cpp:121] Setting up relu7 +I0906 13:58:23.510854 16515 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:58:23.510857 16515 net.cpp:134] Memory required for data: 414124400 +I0906 13:58:23.510862 16515 layer_factory.hpp:74] Creating layer fc8 +I0906 13:58:23.510885 16515 net.cpp:91] Creating Layer fc8 +I0906 13:58:23.510890 16515 net.cpp:411] fc8 <- fc7 +I0906 13:58:23.510906 16515 net.cpp:369] fc8 -> fc8 +I0906 13:58:23.510932 16515 net.cpp:121] Setting up fc8 +I0906 13:58:24.034812 16515 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:58:24.034833 16515 net.cpp:134] Memory required for data: 414324400 +I0906 13:58:24.034860 16515 layer_factory.hpp:74] Creating layer fc8_fc8_0_split +I0906 13:58:24.034893 16515 net.cpp:91] Creating Layer fc8_fc8_0_split +I0906 13:58:24.034958 16515 net.cpp:411] fc8_fc8_0_split <- fc8 +I0906 13:58:24.034988 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 +I0906 13:58:24.035012 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 +I0906 13:58:24.035023 16515 net.cpp:121] Setting up fc8_fc8_0_split +I0906 13:58:24.035040 16515 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:58:24.035046 16515 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:58:24.035050 16515 net.cpp:134] Memory required for data: 414724400 +I0906 13:58:24.035055 16515 layer_factory.hpp:74] Creating layer accuracy +I0906 13:58:24.035086 16515 net.cpp:91] Creating Layer 
accuracy +I0906 13:58:24.035092 16515 net.cpp:411] accuracy <- fc8_fc8_0_split_0 +I0906 13:58:24.035104 16515 net.cpp:411] accuracy <- label_data_1_split_0 +I0906 13:58:24.035115 16515 net.cpp:369] accuracy -> accuracy +I0906 13:58:24.035126 16515 net.cpp:121] Setting up accuracy +I0906 13:58:24.035143 16515 net.cpp:128] Top shape: (1) +I0906 13:58:24.035147 16515 net.cpp:134] Memory required for data: 414724404 +I0906 13:58:24.035152 16515 layer_factory.hpp:74] Creating layer loss +I0906 13:58:24.035163 16515 net.cpp:91] Creating Layer loss +I0906 13:58:24.035168 16515 net.cpp:411] loss <- fc8_fc8_0_split_1 +I0906 13:58:24.035179 16515 net.cpp:411] loss <- label_data_1_split_1 +I0906 13:58:24.035190 16515 net.cpp:369] loss -> loss +I0906 13:58:24.035202 16515 net.cpp:121] Setting up loss +I0906 13:58:24.035212 16515 layer_factory.hpp:74] Creating layer loss +I0906 13:58:24.035562 16515 net.cpp:128] Top shape: (1) +I0906 13:58:24.035567 16515 net.cpp:130] with loss weight 1 +I0906 13:58:24.035583 16515 net.cpp:134] Memory required for data: 414724408 +I0906 13:58:24.035591 16515 net.cpp:193] loss needs backward computation. +I0906 13:58:24.035598 16515 net.cpp:195] accuracy does not need backward computation. +I0906 13:58:24.035605 16515 net.cpp:193] fc8_fc8_0_split needs backward computation. +I0906 13:58:24.035610 16515 net.cpp:193] fc8 needs backward computation. +I0906 13:58:24.035616 16515 net.cpp:193] relu7 needs backward computation. +I0906 13:58:24.035621 16515 net.cpp:193] fc7 needs backward computation. +I0906 13:58:24.035627 16515 net.cpp:193] relu6 needs backward computation. +I0906 13:58:24.035634 16515 net.cpp:193] fc6 needs backward computation. +I0906 13:58:24.035640 16515 net.cpp:193] pool5 needs backward computation. +I0906 13:58:24.035645 16515 net.cpp:193] relu5 needs backward computation. +I0906 13:58:24.035651 16515 net.cpp:193] conv5 needs backward computation. +I0906 13:58:24.035656 16515 net.cpp:193] relu4 needs backward computation. 
+I0906 13:58:24.035662 16515 net.cpp:193] conv4 needs backward computation. +I0906 13:58:24.035668 16515 net.cpp:193] relu3 needs backward computation. +I0906 13:58:24.035673 16515 net.cpp:193] conv3 needs backward computation. +I0906 13:58:24.035679 16515 net.cpp:193] pool2 needs backward computation. +I0906 13:58:24.035686 16515 net.cpp:193] norm2 needs backward computation. +I0906 13:58:24.035692 16515 net.cpp:193] relu2 needs backward computation. +I0906 13:58:24.035697 16515 net.cpp:193] conv2 needs backward computation. +I0906 13:58:24.035703 16515 net.cpp:193] pool1 needs backward computation. +I0906 13:58:24.035709 16515 net.cpp:193] norm1 needs backward computation. +I0906 13:58:24.035715 16515 net.cpp:193] relu1 needs backward computation. +I0906 13:58:24.035720 16515 net.cpp:193] conv1 needs backward computation. +I0906 13:58:24.035727 16515 net.cpp:195] label_data_1_split does not need backward computation. +I0906 13:58:24.035734 16515 net.cpp:195] data does not need backward computation. +I0906 13:58:24.035739 16515 net.cpp:236] This network produces output accuracy +I0906 13:58:24.035745 16515 net.cpp:236] This network produces output loss +I0906 13:58:24.035781 16515 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:58:24.035796 16515 net.cpp:248] Network initialization done. +I0906 13:58:24.035799 16515 net.cpp:249] Memory required for data: 414724408 +I0906 13:58:24.036000 16515 solver.cpp:53] Solver scaffolding done. 
+I0906 13:58:24.036130 16515 solver.cpp \ No newline at end of file diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 new file mode 100644 index 00000000..d142f7c0 --- /dev/null +++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 @@ -0,0 +1,1208 @@ +Log file created at: 2015/09/06 13:58:55 +Running on machine: AMD-RESEARCH +Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg +I0906 13:58:55.707435 16537 caffe.cpp:114] Use GPU with device ID 0 +I0906 13:58:55.745967 16537 device.cpp:230] Number of platforms found:1 +I0906 13:58:55.746011 16537 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing +I0906 13:58:55.746028 16537 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE +I0906 13:58:55.746036 16537 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) +I0906 13:58:55.746042 16537 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. 
+I0906 13:58:55.746048 16537 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices +I0906 13:58:55.746059 16537 device.cpp:286] Number of devices found:1 +I0906 13:58:55.746064 16537 device.cpp:288] DeviceID: 0x18262f0 +I0906 13:58:55.746088 16537 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU +I0906 13:58:55.746098 16537 device.cpp:393] Is it integrated GPU?: 0 +I0906 13:58:55.746105 16537 device.cpp:393] Max clock frequency MHz: 930 +I0906 13:58:55.746111 16537 device.cpp:393] Host-Device unified mem: 0 +I0906 13:58:55.746117 16537 device.cpp:393] ECC support: 0 +I0906 13:58:55.746124 16537 device.cpp:393] Endian little: 1 +I0906 13:58:55.746130 16537 device.cpp:393] Max compute units: 44 +I0906 13:58:55.746136 16537 device.cpp:393] Max work group size: 256 +I0906 13:58:55.746145 16537 device.cpp:393] Max work item dimensions: 3 +I0906 13:58:55.746151 16537 device.cpp:393] Max work item sizes: 0x100 +I0906 13:58:55.746160 16537 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE +I0906 13:58:55.746167 16537 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL +I0906 13:58:55.746173 16537 device.cpp:393] Max mem alloc size: 4244635648 +I0906 13:58:55.746179 16537 device.cpp:393] Global mem size: 16878927872 +I0906 13:58:55.746186 16537 device.cpp:393] Local mem size: 32768 +I0906 13:58:55.746198 16537 device.cpp:96] Picked device type : GPU 0 +I0906 13:58:58.131669 16537 device.cpp:152] Build Program +I0906 13:58:58.131891 16537 caffe.cpp:122] Starting Optimization +I0906 13:58:58.132027 16537 solver.cpp:40] Initializing solver from parameters: +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +display: 1 +max_iter: 10 +lr_policy: "step" +gamma: 0.1 +momentum: 0.9 +weight_decay: 0.0005 +stepsize: 100000 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +I0906 
13:58:58.132150 16537 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:58:58.133236 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data +I0906 13:58:58.133285 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy +I0906 13:58:58.133460 16537 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TRAIN +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 100 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: 
"norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + 
} +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:58:58.133894 16537 net.cpp:68] Memory required for data: 0 +I0906 13:58:58.134050 16537 layer_factory.hpp:74] Creating layer data +I0906 13:58:58.134104 16537 net.cpp:91] Creating Layer data +I0906 13:58:58.134125 16537 net.cpp:369] data -> data +I0906 13:58:58.134229 16537 net.cpp:369] data -> label +I0906 13:58:58.134253 16537 net.cpp:121] Setting up data +I0906 13:58:58.134266 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:58:58.143668 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb +I0906 13:58:58.144057 16537 data_layer.cpp:53] output data size: 100,3,227,227 +I0906 13:58:58.175259 16537 base_data_layer.cpp:43] Initializing prefetch +I0906 13:58:58.175475 16537 base_data_layer.cpp:45] Prefetch initialized. 
+I0906 13:58:58.175534 16537 net.cpp:128] Top shape: 100 3 227 227 (15458700) +I0906 13:58:58.175544 16537 net.cpp:128] Top shape: 100 (100) +I0906 13:58:58.175547 16537 net.cpp:134] Memory required for data: 61835200 +I0906 13:58:58.175582 16537 layer_factory.hpp:74] Creating layer conv1 +I0906 13:58:58.175659 16537 net.cpp:91] Creating Layer conv1 +I0906 13:58:58.175683 16537 net.cpp:411] conv1 <- data +I0906 13:58:58.175760 16537 net.cpp:369] conv1 -> conv1 +I0906 13:58:58.175793 16537 net.cpp:121] Setting up conv1 +I0906 13:58:58.180706 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:58.180712 16537 net.cpp:134] Memory required for data: 177995200 +I0906 13:58:58.180752 16537 layer_factory.hpp:74] Creating layer relu1 +I0906 13:58:58.180774 16537 net.cpp:91] Creating Layer relu1 +I0906 13:58:58.180780 16537 net.cpp:411] relu1 <- conv1 +I0906 13:58:58.180794 16537 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:58:58.180804 16537 net.cpp:121] Setting up relu1 +I0906 13:58:58.180811 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:58.180815 16537 net.cpp:134] Memory required for data: 294155200 +I0906 13:58:58.180821 16537 layer_factory.hpp:74] Creating layer norm1 +I0906 13:58:58.180848 16537 net.cpp:91] Creating Layer norm1 +I0906 13:58:58.180855 16537 net.cpp:411] norm1 <- conv1 +I0906 13:58:58.180867 16537 net.cpp:369] norm1 -> norm1 +I0906 13:58:58.180881 16537 net.cpp:121] Setting up norm1 +I0906 13:58:58.180905 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) +I0906 13:58:58.180909 16537 net.cpp:134] Memory required for data: 410315200 +I0906 13:58:58.180915 16537 layer_factory.hpp:74] Creating layer pool1 +I0906 13:58:58.180938 16537 net.cpp:91] Creating Layer pool1 +I0906 13:58:58.180944 16537 net.cpp:411] pool1 <- norm1 +I0906 13:58:58.180958 16537 net.cpp:369] pool1 -> pool1 +I0906 13:58:58.180970 16537 net.cpp:121] Setting up pool1 +I0906 13:58:58.180999 16537 net.cpp:128] Top shape: 100 96 27 27 (6998400) 
+I0906 13:58:58.181004 16537 net.cpp:134] Memory required for data: 438308800 +I0906 13:58:58.181008 16537 layer_factory.hpp:74] Creating layer conv2 +I0906 13:58:58.181023 16537 net.cpp:91] Creating Layer conv2 +I0906 13:58:58.181030 16537 net.cpp:411] conv2 <- pool1 +I0906 13:58:58.181044 16537 net.cpp:369] conv2 -> conv2 +I0906 13:58:58.181056 16537 net.cpp:121] Setting up conv2 +I0906 13:58:58.221200 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:58.221215 16537 net.cpp:134] Memory required for data: 512958400 +I0906 13:58:58.221245 16537 layer_factory.hpp:74] Creating layer relu2 +I0906 13:58:58.221267 16537 net.cpp:91] Creating Layer relu2 +I0906 13:58:58.221277 16537 net.cpp:411] relu2 <- conv2 +I0906 13:58:58.221297 16537 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:58:58.221312 16537 net.cpp:121] Setting up relu2 +I0906 13:58:58.221320 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:58.221324 16537 net.cpp:134] Memory required for data: 587608000 +I0906 13:58:58.221329 16537 layer_factory.hpp:74] Creating layer norm2 +I0906 13:58:58.221346 16537 net.cpp:91] Creating Layer norm2 +I0906 13:58:58.221352 16537 net.cpp:411] norm2 <- conv2 +I0906 13:58:58.221366 16537 net.cpp:369] norm2 -> norm2 +I0906 13:58:58.221379 16537 net.cpp:121] Setting up norm2 +I0906 13:58:58.221397 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) +I0906 13:58:58.221402 16537 net.cpp:134] Memory required for data: 662257600 +I0906 13:58:58.221407 16537 layer_factory.hpp:74] Creating layer pool2 +I0906 13:58:58.221429 16537 net.cpp:91] Creating Layer pool2 +I0906 13:58:58.221436 16537 net.cpp:411] pool2 <- norm2 +I0906 13:58:58.221448 16537 net.cpp:369] pool2 -> pool2 +I0906 13:58:58.221460 16537 net.cpp:121] Setting up pool2 +I0906 13:58:58.221480 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:58.221484 16537 net.cpp:134] Memory required for data: 679563200 +I0906 13:58:58.221534 16537 layer_factory.hpp:74] Creating 
layer conv3 +I0906 13:58:58.221555 16537 net.cpp:91] Creating Layer conv3 +I0906 13:58:58.221561 16537 net.cpp:411] conv3 <- pool2 +I0906 13:58:58.221576 16537 net.cpp:369] conv3 -> conv3 +I0906 13:58:58.221592 16537 net.cpp:121] Setting up conv3 +I0906 13:58:58.338774 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:58.338798 16537 net.cpp:134] Memory required for data: 705521600 +I0906 13:58:58.338841 16537 layer_factory.hpp:74] Creating layer relu3 +I0906 13:58:58.338876 16537 net.cpp:91] Creating Layer relu3 +I0906 13:58:58.338891 16537 net.cpp:411] relu3 <- conv3 +I0906 13:58:58.338918 16537 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:58:58.338935 16537 net.cpp:121] Setting up relu3 +I0906 13:58:58.338944 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:58.338948 16537 net.cpp:134] Memory required for data: 731480000 +I0906 13:58:58.338953 16537 layer_factory.hpp:74] Creating layer conv4 +I0906 13:58:58.338979 16537 net.cpp:91] Creating Layer conv4 +I0906 13:58:58.338985 16537 net.cpp:411] conv4 <- conv3 +I0906 13:58:58.339002 16537 net.cpp:369] conv4 -> conv4 +I0906 13:58:58.339017 16537 net.cpp:121] Setting up conv4 +I0906 13:58:58.369153 16541 data_layer.cpp:120] Prefetch batch: 193 ms. +I0906 13:58:58.369201 16541 data_layer.cpp:121] Read time: 23.991 ms. +I0906 13:58:58.369210 16541 data_layer.cpp:122] Transform time: 167.322 ms. 
+I0906 13:58:58.426654 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:58.426676 16537 net.cpp:134] Memory required for data: 757438400 +I0906 13:58:58.426703 16537 layer_factory.hpp:74] Creating layer relu4 +I0906 13:58:58.426735 16537 net.cpp:91] Creating Layer relu4 +I0906 13:58:58.426749 16537 net.cpp:411] relu4 <- conv4 +I0906 13:58:58.426776 16537 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:58:58.426794 16537 net.cpp:121] Setting up relu4 +I0906 13:58:58.426802 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) +I0906 13:58:58.426806 16537 net.cpp:134] Memory required for data: 783396800 +I0906 13:58:58.426811 16537 layer_factory.hpp:74] Creating layer conv5 +I0906 13:58:58.426838 16537 net.cpp:91] Creating Layer conv5 +I0906 13:58:58.426843 16537 net.cpp:411] conv5 <- conv4 +I0906 13:58:58.426858 16537 net.cpp:369] conv5 -> conv5 +I0906 13:58:58.426873 16537 net.cpp:121] Setting up conv5 +I0906 13:58:58.484124 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:58.484143 16537 net.cpp:134] Memory required for data: 800702400 +I0906 13:58:58.484182 16537 layer_factory.hpp:74] Creating layer relu5 +I0906 13:58:58.484212 16537 net.cpp:91] Creating Layer relu5 +I0906 13:58:58.484225 16537 net.cpp:411] relu5 <- conv5 +I0906 13:58:58.484251 16537 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:58:58.484266 16537 net.cpp:121] Setting up relu5 +I0906 13:58:58.484274 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) +I0906 13:58:58.484278 16537 net.cpp:134] Memory required for data: 818008000 +I0906 13:58:58.484282 16537 layer_factory.hpp:74] Creating layer pool5 +I0906 13:58:58.484302 16537 net.cpp:91] Creating Layer pool5 +I0906 13:58:58.484308 16537 net.cpp:411] pool5 <- conv5 +I0906 13:58:58.484321 16537 net.cpp:369] pool5 -> pool5 +I0906 13:58:58.484335 16537 net.cpp:121] Setting up pool5 +I0906 13:58:58.484355 16537 net.cpp:128] Top shape: 100 256 6 6 (921600) +I0906 13:58:58.484359 16537 net.cpp:134] Memory 
required for data: 821694400 +I0906 13:58:58.484364 16537 layer_factory.hpp:74] Creating layer fc6 +I0906 13:58:58.484400 16537 net.cpp:91] Creating Layer fc6 +I0906 13:58:58.484405 16537 net.cpp:411] fc6 <- pool5 +I0906 13:58:58.484421 16537 net.cpp:369] fc6 -> fc6 +I0906 13:58:58.484434 16537 net.cpp:121] Setting up fc6 +I0906 13:59:03.394265 16537 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:59:03.394289 16537 net.cpp:134] Memory required for data: 823332800 +I0906 13:59:03.394316 16537 layer_factory.hpp:74] Creating layer relu6 +I0906 13:59:03.394362 16537 net.cpp:91] Creating Layer relu6 +I0906 13:59:03.394378 16537 net.cpp:411] relu6 <- fc6 +I0906 13:59:03.394405 16537 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:59:03.394472 16537 net.cpp:121] Setting up relu6 +I0906 13:59:03.394482 16537 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:59:03.394486 16537 net.cpp:134] Memory required for data: 824971200 +I0906 13:59:03.394492 16537 layer_factory.hpp:74] Creating layer fc7 +I0906 13:59:03.394515 16537 net.cpp:91] Creating Layer fc7 +I0906 13:59:03.394521 16537 net.cpp:411] fc7 <- fc6 +I0906 13:59:03.394537 16537 net.cpp:369] fc7 -> fc7 +I0906 13:59:03.394558 16537 net.cpp:121] Setting up fc7 +I0906 13:59:05.554731 16537 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:59:05.554755 16537 net.cpp:134] Memory required for data: 826609600 +I0906 13:59:05.554782 16537 layer_factory.hpp:74] Creating layer relu7 +I0906 13:59:05.554815 16537 net.cpp:91] Creating Layer relu7 +I0906 13:59:05.554829 16537 net.cpp:411] relu7 <- fc7 +I0906 13:59:05.554855 16537 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:59:05.554870 16537 net.cpp:121] Setting up relu7 +I0906 13:59:05.554879 16537 net.cpp:128] Top shape: 100 4096 (409600) +I0906 13:59:05.554883 16537 net.cpp:134] Memory required for data: 828248000 +I0906 13:59:05.554888 16537 layer_factory.hpp:74] Creating layer fc8 +I0906 13:59:05.554911 16537 net.cpp:91] Creating Layer fc8 +I0906 13:59:05.554916 16537 
net.cpp:411] fc8 <- fc7 +I0906 13:59:05.554932 16537 net.cpp:369] fc8 -> fc8 +I0906 13:59:05.554946 16537 net.cpp:121] Setting up fc8 +I0906 13:59:06.080322 16537 net.cpp:128] Top shape: 100 1000 (100000) +I0906 13:59:06.080343 16537 net.cpp:134] Memory required for data: 828648000 +I0906 13:59:06.080370 16537 layer_factory.hpp:74] Creating layer loss +I0906 13:59:06.080420 16537 net.cpp:91] Creating Layer loss +I0906 13:59:06.080435 16537 net.cpp:411] loss <- fc8 +I0906 13:59:06.080457 16537 net.cpp:411] loss <- label +I0906 13:59:06.080476 16537 net.cpp:369] loss -> loss +I0906 13:59:06.080497 16537 net.cpp:121] Setting up loss +I0906 13:59:06.080515 16537 layer_factory.hpp:74] Creating layer loss +I0906 13:59:06.081025 16537 net.cpp:128] Top shape: (1) +I0906 13:59:06.081030 16537 net.cpp:130] with loss weight 1 +I0906 13:59:06.081048 16537 net.cpp:134] Memory required for data: 828648004 +I0906 13:59:06.081055 16537 net.cpp:193] loss needs backward computation. +I0906 13:59:06.081063 16537 net.cpp:193] fc8 needs backward computation. +I0906 13:59:06.081069 16537 net.cpp:193] relu7 needs backward computation. +I0906 13:59:06.081074 16537 net.cpp:193] fc7 needs backward computation. +I0906 13:59:06.081080 16537 net.cpp:193] relu6 needs backward computation. +I0906 13:59:06.081086 16537 net.cpp:193] fc6 needs backward computation. +I0906 13:59:06.081091 16537 net.cpp:193] pool5 needs backward computation. +I0906 13:59:06.081097 16537 net.cpp:193] relu5 needs backward computation. +I0906 13:59:06.081102 16537 net.cpp:193] conv5 needs backward computation. +I0906 13:59:06.081109 16537 net.cpp:193] relu4 needs backward computation. +I0906 13:59:06.081114 16537 net.cpp:193] conv4 needs backward computation. +I0906 13:59:06.081120 16537 net.cpp:193] relu3 needs backward computation. +I0906 13:59:06.081125 16537 net.cpp:193] conv3 needs backward computation. +I0906 13:59:06.081132 16537 net.cpp:193] pool2 needs backward computation. 
+I0906 13:59:06.081138 16537 net.cpp:193] norm2 needs backward computation. +I0906 13:59:06.081145 16537 net.cpp:193] relu2 needs backward computation. +I0906 13:59:06.081149 16537 net.cpp:193] conv2 needs backward computation. +I0906 13:59:06.081156 16537 net.cpp:193] pool1 needs backward computation. +I0906 13:59:06.081161 16537 net.cpp:193] norm1 needs backward computation. +I0906 13:59:06.081167 16537 net.cpp:193] relu1 needs backward computation. +I0906 13:59:06.081173 16537 net.cpp:193] conv1 needs backward computation. +I0906 13:59:06.081181 16537 net.cpp:195] data does not need backward computation. +I0906 13:59:06.081187 16537 net.cpp:236] This network produces output loss +I0906 13:59:06.081223 16537 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:59:06.081238 16537 net.cpp:248] Network initialization done. +I0906 13:59:06.081241 16537 net.cpp:249] Memory required for data: 828648004 +I0906 13:59:06.082168 16537 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt +I0906 13:59:06.082299 16537 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data +I0906 13:59:06.082527 16537 net.cpp:43] Initializing net from parameters: +name: "AlexNet" +state { + phase: TEST +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } 
+} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" 
+ param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} +I0906 13:59:06.082866 16537 net.cpp:68] Memory required for data: 0 +I0906 13:59:06.082913 16537 layer_factory.hpp:74] Creating layer data +I0906 13:59:06.082934 16537 
net.cpp:91] Creating Layer data +I0906 13:59:06.082944 16537 net.cpp:369] data -> data +I0906 13:59:06.082967 16537 net.cpp:369] data -> label +I0906 13:59:06.082981 16537 net.cpp:121] Setting up data +I0906 13:59:06.082988 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto +I0906 13:59:06.091397 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb +I0906 13:59:06.091647 16537 data_layer.cpp:53] output data size: 50,3,227,227 +I0906 13:59:06.107939 16537 base_data_layer.cpp:43] Initializing prefetch +I0906 13:59:06.108054 16537 base_data_layer.cpp:45] Prefetch initialized. +I0906 13:59:06.108088 16537 net.cpp:128] Top shape: 50 3 227 227 (7729350) +I0906 13:59:06.108098 16537 net.cpp:128] Top shape: 50 (50) +I0906 13:59:06.108101 16537 net.cpp:134] Memory required for data: 30917600 +I0906 13:59:06.108135 16537 layer_factory.hpp:74] Creating layer label_data_1_split +I0906 13:59:06.108201 16537 net.cpp:91] Creating Layer label_data_1_split +I0906 13:59:06.108216 16537 net.cpp:411] label_data_1_split <- label +I0906 13:59:06.108259 16537 net.cpp:369] label_data_1_split -> label_data_1_split_0 +I0906 13:59:06.108306 16537 net.cpp:369] label_data_1_split -> label_data_1_split_1 +I0906 13:59:06.108319 16537 net.cpp:121] Setting up label_data_1_split +I0906 13:59:06.108353 16537 net.cpp:128] Top shape: 50 (50) +I0906 13:59:06.108361 16537 net.cpp:128] Top shape: 50 (50) +I0906 13:59:06.108364 16537 net.cpp:134] Memory required for data: 30918000 +I0906 13:59:06.108369 16537 layer_factory.hpp:74] Creating layer conv1 +I0906 13:59:06.108403 16537 net.cpp:91] Creating Layer conv1 +I0906 13:59:06.108409 16537 net.cpp:411] conv1 <- data +I0906 13:59:06.108425 16537 net.cpp:369] conv1 -> conv1 +I0906 13:59:06.108440 16537 net.cpp:121] Setting up conv1 +I0906 13:59:06.113059 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:59:06.113065 16537 net.cpp:134] Memory required for 
data: 88998000 +I0906 13:59:06.113085 16537 layer_factory.hpp:74] Creating layer relu1 +I0906 13:59:06.113097 16537 net.cpp:91] Creating Layer relu1 +I0906 13:59:06.113103 16537 net.cpp:411] relu1 <- conv1 +I0906 13:59:06.113116 16537 net.cpp:358] relu1 -> conv1 (in-place) +I0906 13:59:06.113126 16537 net.cpp:121] Setting up relu1 +I0906 13:59:06.113134 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:59:06.113138 16537 net.cpp:134] Memory required for data: 147078000 +I0906 13:59:06.113143 16537 layer_factory.hpp:74] Creating layer norm1 +I0906 13:59:06.113163 16537 net.cpp:91] Creating Layer norm1 +I0906 13:59:06.113169 16537 net.cpp:411] norm1 <- conv1 +I0906 13:59:06.113183 16537 net.cpp:369] norm1 -> norm1 +I0906 13:59:06.113193 16537 net.cpp:121] Setting up norm1 +I0906 13:59:06.113212 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) +I0906 13:59:06.113255 16537 net.cpp:134] Memory required for data: 205158000 +I0906 13:59:06.113260 16537 layer_factory.hpp:74] Creating layer pool1 +I0906 13:59:06.113277 16537 net.cpp:91] Creating Layer pool1 +I0906 13:59:06.113282 16537 net.cpp:411] pool1 <- norm1 +I0906 13:59:06.113296 16537 net.cpp:369] pool1 -> pool1 +I0906 13:59:06.113306 16537 net.cpp:121] Setting up pool1 +I0906 13:59:06.113325 16537 net.cpp:128] Top shape: 50 96 27 27 (3499200) +I0906 13:59:06.113329 16537 net.cpp:134] Memory required for data: 219154800 +I0906 13:59:06.113334 16537 layer_factory.hpp:74] Creating layer conv2 +I0906 13:59:06.113348 16537 net.cpp:91] Creating Layer conv2 +I0906 13:59:06.113354 16537 net.cpp:411] conv2 <- pool1 +I0906 13:59:06.113369 16537 net.cpp:369] conv2 -> conv2 +I0906 13:59:06.113381 16537 net.cpp:121] Setting up conv2 +I0906 13:59:06.154265 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:59:06.154281 16537 net.cpp:134] Memory required for data: 256479600 +I0906 13:59:06.154316 16537 layer_factory.hpp:74] Creating layer relu2 +I0906 13:59:06.154345 16537 net.cpp:91] Creating Layer 
relu2 +I0906 13:59:06.154355 16537 net.cpp:411] relu2 <- conv2 +I0906 13:59:06.154374 16537 net.cpp:358] relu2 -> conv2 (in-place) +I0906 13:59:06.154387 16537 net.cpp:121] Setting up relu2 +I0906 13:59:06.154397 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:59:06.154400 16537 net.cpp:134] Memory required for data: 293804400 +I0906 13:59:06.154405 16537 layer_factory.hpp:74] Creating layer norm2 +I0906 13:59:06.154427 16537 net.cpp:91] Creating Layer norm2 +I0906 13:59:06.154433 16537 net.cpp:411] norm2 <- conv2 +I0906 13:59:06.154446 16537 net.cpp:369] norm2 -> norm2 +I0906 13:59:06.154463 16537 net.cpp:121] Setting up norm2 +I0906 13:59:06.154484 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) +I0906 13:59:06.154503 16537 net.cpp:134] Memory required for data: 331129200 +I0906 13:59:06.154508 16537 layer_factory.hpp:74] Creating layer pool2 +I0906 13:59:06.154525 16537 net.cpp:91] Creating Layer pool2 +I0906 13:59:06.154531 16537 net.cpp:411] pool2 <- norm2 +I0906 13:59:06.154544 16537 net.cpp:369] pool2 -> pool2 +I0906 13:59:06.154556 16537 net.cpp:121] Setting up pool2 +I0906 13:59:06.154573 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:59:06.154578 16537 net.cpp:134] Memory required for data: 339782000 +I0906 13:59:06.154583 16537 layer_factory.hpp:74] Creating layer conv3 +I0906 13:59:06.154604 16537 net.cpp:91] Creating Layer conv3 +I0906 13:59:06.154610 16537 net.cpp:411] conv3 <- pool2 +I0906 13:59:06.154625 16537 net.cpp:369] conv3 -> conv3 +I0906 13:59:06.154638 16537 net.cpp:121] Setting up conv3 +I0906 13:59:06.204232 16545 data_layer.cpp:120] Prefetch batch: 96 ms. +I0906 13:59:06.204263 16545 data_layer.cpp:121] Read time: 12.163 ms. +I0906 13:59:06.204272 16545 data_layer.cpp:122] Transform time: 82.876 ms. 
+I0906 13:59:06.270438 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:59:06.270459 16537 net.cpp:134] Memory required for data: 352761200 +I0906 13:59:06.270499 16537 layer_factory.hpp:74] Creating layer relu3 +I0906 13:59:06.270532 16537 net.cpp:91] Creating Layer relu3 +I0906 13:59:06.270546 16537 net.cpp:411] relu3 <- conv3 +I0906 13:59:06.270571 16537 net.cpp:358] relu3 -> conv3 (in-place) +I0906 13:59:06.270587 16537 net.cpp:121] Setting up relu3 +I0906 13:59:06.270596 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:59:06.270601 16537 net.cpp:134] Memory required for data: 365740400 +I0906 13:59:06.270606 16537 layer_factory.hpp:74] Creating layer conv4 +I0906 13:59:06.270630 16537 net.cpp:91] Creating Layer conv4 +I0906 13:59:06.270637 16537 net.cpp:411] conv4 <- conv3 +I0906 13:59:06.270651 16537 net.cpp:369] conv4 -> conv4 +I0906 13:59:06.270666 16537 net.cpp:121] Setting up conv4 +I0906 13:59:06.357051 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:59:06.357074 16537 net.cpp:134] Memory required for data: 378719600 +I0906 13:59:06.357100 16537 layer_factory.hpp:74] Creating layer relu4 +I0906 13:59:06.357132 16537 net.cpp:91] Creating Layer relu4 +I0906 13:59:06.357184 16537 net.cpp:411] relu4 <- conv4 +I0906 13:59:06.357210 16537 net.cpp:358] relu4 -> conv4 (in-place) +I0906 13:59:06.357226 16537 net.cpp:121] Setting up relu4 +I0906 13:59:06.357235 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) +I0906 13:59:06.357239 16537 net.cpp:134] Memory required for data: 391698800 +I0906 13:59:06.357244 16537 layer_factory.hpp:74] Creating layer conv5 +I0906 13:59:06.357270 16537 net.cpp:91] Creating Layer conv5 +I0906 13:59:06.357276 16537 net.cpp:411] conv5 <- conv4 +I0906 13:59:06.357292 16537 net.cpp:369] conv5 -> conv5 +I0906 13:59:06.357308 16537 net.cpp:121] Setting up conv5 +I0906 13:59:06.414666 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:59:06.414685 16537 net.cpp:134] Memory required 
for data: 400351600 +I0906 13:59:06.414727 16537 layer_factory.hpp:74] Creating layer relu5 +I0906 13:59:06.414757 16537 net.cpp:91] Creating Layer relu5 +I0906 13:59:06.414770 16537 net.cpp:411] relu5 <- conv5 +I0906 13:59:06.414794 16537 net.cpp:358] relu5 -> conv5 (in-place) +I0906 13:59:06.414808 16537 net.cpp:121] Setting up relu5 +I0906 13:59:06.414818 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) +I0906 13:59:06.414820 16537 net.cpp:134] Memory required for data: 409004400 +I0906 13:59:06.414825 16537 layer_factory.hpp:74] Creating layer pool5 +I0906 13:59:06.414855 16537 net.cpp:91] Creating Layer pool5 +I0906 13:59:06.414860 16537 net.cpp:411] pool5 <- conv5 +I0906 13:59:06.414875 16537 net.cpp:369] pool5 -> pool5 +I0906 13:59:06.414888 16537 net.cpp:121] Setting up pool5 +I0906 13:59:06.414908 16537 net.cpp:128] Top shape: 50 256 6 6 (460800) +I0906 13:59:06.414912 16537 net.cpp:134] Memory required for data: 410847600 +I0906 13:59:06.414917 16537 layer_factory.hpp:74] Creating layer fc6 +I0906 13:59:06.414938 16537 net.cpp:91] Creating Layer fc6 +I0906 13:59:06.414944 16537 net.cpp:411] fc6 <- pool5 +I0906 13:59:06.414959 16537 net.cpp:369] fc6 -> fc6 +I0906 13:59:06.414971 16537 net.cpp:121] Setting up fc6 +I0906 13:59:11.292778 16537 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:59:11.292801 16537 net.cpp:134] Memory required for data: 411666800 +I0906 13:59:11.292829 16537 layer_factory.hpp:74] Creating layer relu6 +I0906 13:59:11.292860 16537 net.cpp:91] Creating Layer relu6 +I0906 13:59:11.292876 16537 net.cpp:411] relu6 <- fc6 +I0906 13:59:11.292902 16537 net.cpp:358] relu6 -> fc6 (in-place) +I0906 13:59:11.292918 16537 net.cpp:121] Setting up relu6 +I0906 13:59:11.292927 16537 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:59:11.292932 16537 net.cpp:134] Memory required for data: 412486000 +I0906 13:59:11.292937 16537 layer_factory.hpp:74] Creating layer fc7 +I0906 13:59:11.292958 16537 net.cpp:91] Creating Layer fc7 +I0906 
13:59:11.292964 16537 net.cpp:411] fc7 <- fc6 +I0906 13:59:11.292980 16537 net.cpp:369] fc7 -> fc7 +I0906 13:59:11.292995 16537 net.cpp:121] Setting up fc7 +I0906 13:59:13.449043 16537 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:59:13.449066 16537 net.cpp:134] Memory required for data: 413305200 +I0906 13:59:13.449095 16537 layer_factory.hpp:74] Creating layer relu7 +I0906 13:59:13.449126 16537 net.cpp:91] Creating Layer relu7 +I0906 13:59:13.449141 16537 net.cpp:411] relu7 <- fc7 +I0906 13:59:13.449167 16537 net.cpp:358] relu7 -> fc7 (in-place) +I0906 13:59:13.449182 16537 net.cpp:121] Setting up relu7 +I0906 13:59:13.449192 16537 net.cpp:128] Top shape: 50 4096 (204800) +I0906 13:59:13.449195 16537 net.cpp:134] Memory required for data: 414124400 +I0906 13:59:13.449200 16537 layer_factory.hpp:74] Creating layer fc8 +I0906 13:59:13.449223 16537 net.cpp:91] Creating Layer fc8 +I0906 13:59:13.449229 16537 net.cpp:411] fc8 <- fc7 +I0906 13:59:13.449244 16537 net.cpp:369] fc8 -> fc8 +I0906 13:59:13.449270 16537 net.cpp:121] Setting up fc8 +I0906 13:59:13.974771 16537 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:59:13.974793 16537 net.cpp:134] Memory required for data: 414324400 +I0906 13:59:13.974820 16537 layer_factory.hpp:74] Creating layer fc8_fc8_0_split +I0906 13:59:13.974851 16537 net.cpp:91] Creating Layer fc8_fc8_0_split +I0906 13:59:13.974911 16537 net.cpp:411] fc8_fc8_0_split <- fc8 +I0906 13:59:13.974939 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 +I0906 13:59:13.974962 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 +I0906 13:59:13.974974 16537 net.cpp:121] Setting up fc8_fc8_0_split +I0906 13:59:13.974992 16537 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:59:13.974998 16537 net.cpp:128] Top shape: 50 1000 (50000) +I0906 13:59:13.975003 16537 net.cpp:134] Memory required for data: 414724400 +I0906 13:59:13.975006 16537 layer_factory.hpp:74] Creating layer accuracy +I0906 13:59:13.975038 16537 net.cpp:91] Creating Layer 
accuracy +I0906 13:59:13.975044 16537 net.cpp:411] accuracy <- fc8_fc8_0_split_0 +I0906 13:59:13.975054 16537 net.cpp:411] accuracy <- label_data_1_split_0 +I0906 13:59:13.975065 16537 net.cpp:369] accuracy -> accuracy +I0906 13:59:13.975076 16537 net.cpp:121] Setting up accuracy +I0906 13:59:13.975092 16537 net.cpp:128] Top shape: (1) +I0906 13:59:13.975096 16537 net.cpp:134] Memory required for data: 414724404 +I0906 13:59:13.975101 16537 layer_factory.hpp:74] Creating layer loss +I0906 13:59:13.975112 16537 net.cpp:91] Creating Layer loss +I0906 13:59:13.975117 16537 net.cpp:411] loss <- fc8_fc8_0_split_1 +I0906 13:59:13.975128 16537 net.cpp:411] loss <- label_data_1_split_1 +I0906 13:59:13.975139 16537 net.cpp:369] loss -> loss +I0906 13:59:13.975150 16537 net.cpp:121] Setting up loss +I0906 13:59:13.975160 16537 layer_factory.hpp:74] Creating layer loss +I0906 13:59:13.975487 16537 net.cpp:128] Top shape: (1) +I0906 13:59:13.975492 16537 net.cpp:130] with loss weight 1 +I0906 13:59:13.975507 16537 net.cpp:134] Memory required for data: 414724408 +I0906 13:59:13.975513 16537 net.cpp:193] loss needs backward computation. +I0906 13:59:13.975520 16537 net.cpp:195] accuracy does not need backward computation. +I0906 13:59:13.975528 16537 net.cpp:193] fc8_fc8_0_split needs backward computation. +I0906 13:59:13.975533 16537 net.cpp:193] fc8 needs backward computation. +I0906 13:59:13.975538 16537 net.cpp:193] relu7 needs backward computation. +I0906 13:59:13.975544 16537 net.cpp:193] fc7 needs backward computation. +I0906 13:59:13.975549 16537 net.cpp:193] relu6 needs backward computation. +I0906 13:59:13.975555 16537 net.cpp:193] fc6 needs backward computation. +I0906 13:59:13.975560 16537 net.cpp:193] pool5 needs backward computation. +I0906 13:59:13.975566 16537 net.cpp:193] relu5 needs backward computation. +I0906 13:59:13.975572 16537 net.cpp:193] conv5 needs backward computation. +I0906 13:59:13.975577 16537 net.cpp:193] relu4 needs backward computation. 
+I0906 13:59:13.975582 16537 net.cpp:193] conv4 needs backward computation. +I0906 13:59:13.975589 16537 net.cpp:193] relu3 needs backward computation. +I0906 13:59:13.975594 16537 net.cpp:193] conv3 needs backward computation. +I0906 13:59:13.975600 16537 net.cpp:193] pool2 needs backward computation. +I0906 13:59:13.975605 16537 net.cpp:193] norm2 needs backward computation. +I0906 13:59:13.975611 16537 net.cpp:193] relu2 needs backward computation. +I0906 13:59:13.975616 16537 net.cpp:193] conv2 needs backward computation. +I0906 13:59:13.975622 16537 net.cpp:193] pool1 needs backward computation. +I0906 13:59:13.975628 16537 net.cpp:193] norm1 needs backward computation. +I0906 13:59:13.975635 16537 net.cpp:193] relu1 needs backward computation. +I0906 13:59:13.975639 16537 net.cpp:193] conv1 needs backward computation. +I0906 13:59:13.975646 16537 net.cpp:195] label_data_1_split does not need backward computation. +I0906 13:59:13.975654 16537 net.cpp:195] data does not need backward computation. +I0906 13:59:13.975658 16537 net.cpp:236] This network produces output accuracy +I0906 13:59:13.975664 16537 net.cpp:236] This network produces output loss +I0906 13:59:13.975702 16537 net.cpp:483] Collecting Learning Rate and Weight Decay. +I0906 13:59:13.975714 16537 net.cpp:248] Network initialization done. +I0906 13:59:13.975718 16537 net.cpp:249] Memory required for data: 414724408 +I0906 13:59:13.975903 16537 solver.cpp:53] Solver scaffolding done. 
+I0906 13:59:13.976030 16537 solver.cpp:270] Solving AlexNet +I0906 13:59:13.976050 16537 solver.cpp:271] Learning Rate Policy: step +I0906 13:59:13.977635 16537 solver.cpp:314] Iteration 0, Testing net (#0) +I0906 13:59:13.977653 16537 net.cpp:696] Copying source layer data +I0906 13:59:13.977660 16537 net.cpp:696] Copying source layer conv1 +I0906 13:59:13.980556 16537 net.cpp:696] Copying source layer relu1 +I0906 13:59:13.980595 16537 net.cpp:696] Copying source layer norm1 +I0906 13:59:13.980607 16537 net.cpp:696] Copying source layer pool1 +I0906 13:59:13.980617 16537 net.cpp:696] Copying source layer conv2 +I0906 13:59:13.980785 16537 net.cpp:696] Copying source layer relu2 +I0906 13:59:13.980798 16537 net.cpp:696] Copying source layer norm2 +I0906 13:59:13.980808 16537 net.cpp:696] Copying source layer pool2 +I0906 13:59:13.980818 16537 net.cpp:696] Copying source layer conv3 +I0906 13:59:13.981422 16537 net.cpp:696] Copying source layer relu3 +I0906 13:59:13.981437 16537 net.cpp:696] Copying source layer conv4 +I0906 13:59:13.982098 16537 net.cpp:696] Copying source layer relu4 +I0906 13:59:13.982115 16537 net.cpp:696] Copying source layer conv5 +I0906 13:59:13.982612 16537 net.cpp:696] Copying source layer relu5 +I0906 13:59:13.982626 16537 net.cpp:696] Copying source layer pool5 +I0906 13:59:13.982636 16537 net.cpp:696] Copying source layer fc6 +I0906 13:59:13.993058 16537 net.cpp:696] Copying source layer relu6 +I0906 13:59:13.993091 16537 net.cpp:696] Copying source layer fc7 +I0906 13:59:13.997967 16537 net.cpp:696] Copying source layer relu7 +I0906 13:59:13.997984 16537 net.cpp:696] Copying source layer fc8 +I0906 13:59:13.998755 16537 net.cpp:696] Copying source layer loss +I0906 13:59:13.998867 16537 base_data_layer.cpp:89] Thread joined +I0906 13:59:14.003283 16537 base_data_layer.cpp:93] Prefetch copied +I0906 13:59:14.003650 16537 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:59:14.096194 16546 data_layer.cpp:120] Prefetch batch: 92 
ms. +I0906 13:59:14.096225 16546 data_layer.cpp:121] Read time: 12.131 ms. +I0906 13:59:14.096233 16546 data_layer.cpp:122] Transform time: 79.106 ms. +I0906 13:59:17.032117 16537 solver.cpp:363] Test net output #0: accuracy = 0 +I0906 13:59:17.032146 16537 solver.cpp:363] Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss) +I0906 13:59:17.032196 16537 base_data_layer.cpp:89] Thread joined +I0906 13:59:17.041095 16537 base_data_layer.cpp:93] Prefetch copied +I0906 13:59:17.041471 16537 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:59:17.232076 16547 data_layer.cpp:120] Prefetch batch: 190 ms. +I0906 13:59:17.232108 16547 data_layer.cpp:121] Read time: 24.399 ms. +I0906 13:59:17.232116 16547 data_layer.cpp:122] Transform time: 164.272 ms. +I0906 13:59:23.802855 16537 solver.cpp:234] Iteration 0, loss = 0 +I0906 13:59:23.802914 16537 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) +I0906 13:59:23.802963 16537 solver.cpp:506] Iteration 0, lr = 0.01 +I0906 13:59:23.918314 16537 base_data_layer.cpp:89] Thread joined +I0906 13:59:23.926301 16537 base_data_layer.cpp:93] Prefetch copied +I0906 13:59:23.926447 16537 base_data_layer.cpp:104] CreatePrefetchThread +I0906 13:59:24.110566 16549 data_layer.cpp:120] Prefetch batch: 183 ms. +I0906 13:59:24.110599 16549 data_layer.cpp:121] Read time: 23.839 ms. +I0906 13:59:24.110605 16549 data_layer.cpp:122] Transform time: 158.415 ms. 
+I0906 13:59:26.694295 16537 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.INFO b/log/caffe.INFO new file mode 120000 index 00000000..65520a80 --- /dev/null +++ b/log/caffe.INFO @@ -0,0 +1 @@ +caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 \ No newline at end of file diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index df2de2e0..7e745410 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -33,7 +33,7 @@ #include namespace caffe { -char* buildOption = "-x clc++ "; +string buildOption = "-x clc++ "; std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; @@ -148,7 +148,7 @@ void Device::BuildProgram(std::string kernel_dir) if(NULL == Program){ fprintf(stderr,"Err: Failed to create program\n"); } - cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL); + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), NULL, NULL); LOG(INFO) << "Build Program"; if(CL_SUCCESS != iStatus){ fprintf(stderr,"Err: Failed to build program\n"); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 5ea9b6b5..cd9d2ef5 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -229,7 +229,7 @@ void Solver::Step(int iters) { losses[idx] = loss; printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx); } - printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %d \n", smoothed_loss,average_loss, losses.size()); + printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", smoothed_loss,average_loss, losses.size()); if (display) { LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = net_->output_blobs(); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 123b0053..8cf9bc7b 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -48,16 +48,6 @@ if (cpu_ptr_ && own_cpu_data_) { } clReleaseKernel(oclmem_kernel); -/* if (cpu_ptr_ && own_cpu_data_) { - 
CaffeFreeHost(cpu_ptr_); - } - -#ifndef CPU_ONLY - if (gpu_ptr_) { - CUDA_CHECK(cudaFree(gpu_ptr_)); - } -#endif // CPU_ONLY -*/ } void SyncedMemory::ocl_setup() { @@ -69,13 +59,7 @@ void SyncedMemory::ocl_setup() { inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: - //allocate pre-pinned memory - //pinned_buffer_ptr_ - // if(data_layer_){ - // gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_PERSISTENT_MEM_AMD, size_, NULL, NULL); - // } - // else{ - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); //} cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); memset(cpu_ptr_, 0, size_); @@ -151,17 +135,6 @@ const void* SyncedMemory::cpu_data() { } void SyncedMemory::set_cpu_data(void* data) { -/*CHECK(data); - if (own_cpu_data_) { - OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL)); - OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_)); - clFinish(amdDevice.CommandQueue); //is this necessary? 
- } - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_HOST_PTR, size_, data, NULL); - cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); - head_ = HEAD_AT_CPU; - own_cpu_data_ = false; -*/ CHECK(data); if (own_cpu_data_) { CaffeFreeHost(cpu_ptr_); @@ -196,8 +169,10 @@ void* SyncedMemory::mutable_gpu_data() { #endif } -const void *SyncedMemory::gpu_cache_data() -{ +const void *SyncedMemory::gpu_cache_data() { + return 0; } + + } // namespace caffe From f96ca7623084ed162e94f952f606a07d72e9956d Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 6 Sep 2015 14:02:55 +0800 Subject: [PATCH 058/124] Clean up the last two warings --- ...SEARCH.yugao.log.INFO.20150906-133002.7951 | 1250 ----------------- ...SEARCH.yugao.log.INFO.20150906-133358.8300 | 1208 ---------------- ...SEARCH.yugao.log.INFO.20150906-133437.8316 | 1208 ---------------- ...EARCH.yugao.log.INFO.20150906-135805.16515 | 1160 --------------- ...EARCH.yugao.log.INFO.20150906-135855.16537 | 1208 ---------------- log/caffe.INFO | 1 - 6 files changed, 6035 deletions(-) delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 delete mode 120000 log/caffe.INFO diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 deleted file mode 100644 index c75e1aaa..00000000 --- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 +++ /dev/null @@ -1,1250 +0,0 @@ -Log file created at: 2015/09/06 13:30:02 -Running on machine: AMD-RESEARCH -Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg 
-I0906 13:30:02.150327 7951 caffe.cpp:114] Use GPU with device ID 0 -I0906 13:30:02.187862 7951 device.cpp:230] Number of platforms found:1 -I0906 13:30:02.187903 7951 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing -I0906 13:30:02.187918 7951 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE -I0906 13:30:02.187973 7951 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) -I0906 13:30:02.187980 7951 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. -I0906 13:30:02.187991 7951 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices -I0906 13:30:02.188000 7951 device.cpp:286] Number of devices found:1 -I0906 13:30:02.188005 7951 device.cpp:288] DeviceID: 0x2171230 -I0906 13:30:02.188025 7951 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU -I0906 13:30:02.188033 7951 device.cpp:393] Is it integrated GPU?: 0 -I0906 13:30:02.188038 7951 device.cpp:393] Max clock frequency MHz: 930 -I0906 13:30:02.188043 7951 device.cpp:393] Host-Device unified mem: 0 -I0906 13:30:02.188048 7951 device.cpp:393] ECC support: 0 -I0906 13:30:02.188052 7951 device.cpp:393] Endian little: 1 -I0906 13:30:02.188056 7951 device.cpp:393] Max compute units: 44 -I0906 13:30:02.188061 7951 device.cpp:393] Max work group size: 256 -I0906 13:30:02.188066 7951 device.cpp:393] Max work item dimensions: 3 -I0906 13:30:02.188072 7951 device.cpp:393] Max work item sizes: 0x100 -I0906 13:30:02.188078 7951 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE -I0906 13:30:02.188083 7951 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL -I0906 13:30:02.188088 7951 device.cpp:393] Max mem alloc size: 4244635648 -I0906 13:30:02.188092 7951 device.cpp:393] Global mem size: 16878927872 -I0906 13:30:02.188097 7951 device.cpp:393] Local mem size: 32768 -I0906 13:30:02.188107 7951 device.cpp:96] Picked device type : GPU 0 -I0906 13:30:04.630481 7951 device.cpp:152] Build 
Program -I0906 13:30:04.630708 7951 caffe.cpp:122] Starting Optimization -I0906 13:30:04.630797 7951 solver.cpp:40] Initializing solver from parameters: -test_iter: 1 -test_interval: 1000 -base_lr: 0.01 -display: 1 -max_iter: 450000 -lr_policy: "step" -gamma: 0.1 -momentum: 0.9 -weight_decay: 0.0005 -stepsize: 100000 -snapshot: 10000 -snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" -solver_mode: GPU -net: "models/bvlc_alexnet/train_val.prototxt" -I0906 13:30:04.630909 7951 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val.prototxt -I0906 13:30:04.632081 7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data -I0906 13:30:04.632134 7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy -I0906 13:30:04.632319 7951 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TRAIN -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" - batch_size: 256 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} 
-layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { 
- name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "drop6" - type: "Dropout" - bottom: "fc6" - top: "fc6" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "drop7" - type: "Dropout" - bottom: "fc7" - top: "fc7" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:30:04.632813 7951 net.cpp:68] Memory required for data: 0 -I0906 13:30:04.632977 7951 layer_factory.hpp:74] Creating layer data -I0906 13:30:04.633033 7951 net.cpp:91] Creating Layer data -I0906 13:30:04.633055 7951 net.cpp:369] data -> data -I0906 13:30:04.633160 7951 net.cpp:369] data -> label -I0906 13:30:04.633183 7951 net.cpp:121] Setting up data -I0906 13:30:04.633196 7951 data_transformer.cpp:22] Loading mean file 
from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:30:04.642779 7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb -I0906 13:30:04.643064 7951 data_layer.cpp:53] output data size: 256,3,227,227 -I0906 13:30:04.723888 7951 base_data_layer.cpp:43] Initializing prefetch -I0906 13:30:04.724091 7951 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:30:04.724150 7951 net.cpp:128] Top shape: 256 3 227 227 (39574272) -I0906 13:30:04.724161 7951 net.cpp:128] Top shape: 256 (256) -I0906 13:30:04.724165 7951 net.cpp:134] Memory required for data: 158298112 -I0906 13:30:04.724201 7951 layer_factory.hpp:74] Creating layer conv1 -I0906 13:30:04.724283 7951 net.cpp:91] Creating Layer conv1 -I0906 13:30:04.724328 7951 net.cpp:411] conv1 <- data -I0906 13:30:04.724383 7951 net.cpp:369] conv1 -> conv1 -I0906 13:30:04.724417 7951 net.cpp:121] Setting up conv1 -I0906 13:30:04.729287 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) -I0906 13:30:04.729295 7951 net.cpp:134] Memory required for data: 455667712 -I0906 13:30:04.729333 7951 layer_factory.hpp:74] Creating layer relu1 -I0906 13:30:04.729357 7951 net.cpp:91] Creating Layer relu1 -I0906 13:30:04.729362 7951 net.cpp:411] relu1 <- conv1 -I0906 13:30:04.729377 7951 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:30:04.729385 7951 net.cpp:121] Setting up relu1 -I0906 13:30:04.729408 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) -I0906 13:30:04.729411 7951 net.cpp:134] Memory required for data: 753037312 -I0906 13:30:04.729416 7951 layer_factory.hpp:74] Creating layer norm1 -I0906 13:30:04.729444 7951 net.cpp:91] Creating Layer norm1 -I0906 13:30:04.729450 7951 net.cpp:411] norm1 <- conv1 -I0906 13:30:04.729463 7951 net.cpp:369] norm1 -> norm1 -I0906 13:30:04.729476 7951 net.cpp:121] Setting up norm1 -I0906 13:30:04.729499 7951 net.cpp:128] Top shape: 256 96 55 55 (74342400) -I0906 13:30:04.729504 7951 net.cpp:134] Memory required for data: 1050406912 -I0906 
13:30:04.729509 7951 layer_factory.hpp:74] Creating layer pool1 -I0906 13:30:04.729532 7951 net.cpp:91] Creating Layer pool1 -I0906 13:30:04.729537 7951 net.cpp:411] pool1 <- norm1 -I0906 13:30:04.729550 7951 net.cpp:369] pool1 -> pool1 -I0906 13:30:04.729564 7951 net.cpp:121] Setting up pool1 -I0906 13:30:04.729591 7951 net.cpp:128] Top shape: 256 96 27 27 (17915904) -I0906 13:30:04.729596 7951 net.cpp:134] Memory required for data: 1122070528 -I0906 13:30:04.729600 7951 layer_factory.hpp:74] Creating layer conv2 -I0906 13:30:04.729614 7951 net.cpp:91] Creating Layer conv2 -I0906 13:30:04.729619 7951 net.cpp:411] conv2 <- pool1 -I0906 13:30:04.729635 7951 net.cpp:369] conv2 -> conv2 -I0906 13:30:04.729647 7951 net.cpp:121] Setting up conv2 -I0906 13:30:04.769634 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) -I0906 13:30:04.769649 7951 net.cpp:134] Memory required for data: 1313173504 -I0906 13:30:04.769673 7951 layer_factory.hpp:74] Creating layer relu2 -I0906 13:30:04.769695 7951 net.cpp:91] Creating Layer relu2 -I0906 13:30:04.769704 7951 net.cpp:411] relu2 <- conv2 -I0906 13:30:04.769722 7951 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:30:04.769736 7951 net.cpp:121] Setting up relu2 -I0906 13:30:04.769744 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) -I0906 13:30:04.769748 7951 net.cpp:134] Memory required for data: 1504276480 -I0906 13:30:04.769752 7951 layer_factory.hpp:74] Creating layer norm2 -I0906 13:30:04.769769 7951 net.cpp:91] Creating Layer norm2 -I0906 13:30:04.769775 7951 net.cpp:411] norm2 <- conv2 -I0906 13:30:04.769788 7951 net.cpp:369] norm2 -> norm2 -I0906 13:30:04.769800 7951 net.cpp:121] Setting up norm2 -I0906 13:30:04.769820 7951 net.cpp:128] Top shape: 256 256 27 27 (47775744) -I0906 13:30:04.769825 7951 net.cpp:134] Memory required for data: 1695379456 -I0906 13:30:04.769829 7951 layer_factory.hpp:74] Creating layer pool2 -I0906 13:30:04.769850 7951 net.cpp:91] Creating Layer pool2 -I0906 13:30:04.769856 7951 
net.cpp:411] pool2 <- norm2 -I0906 13:30:04.769870 7951 net.cpp:369] pool2 -> pool2 -I0906 13:30:04.769927 7951 net.cpp:121] Setting up pool2 -I0906 13:30:04.769944 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) -I0906 13:30:04.769949 7951 net.cpp:134] Memory required for data: 1739681792 -I0906 13:30:04.769953 7951 layer_factory.hpp:74] Creating layer conv3 -I0906 13:30:04.769975 7951 net.cpp:91] Creating Layer conv3 -I0906 13:30:04.769981 7951 net.cpp:411] conv3 <- pool2 -I0906 13:30:04.769996 7951 net.cpp:369] conv3 -> conv3 -I0906 13:30:04.770010 7951 net.cpp:121] Setting up conv3 -I0906 13:30:04.886401 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) -I0906 13:30:04.886425 7951 net.cpp:134] Memory required for data: 1806135296 -I0906 13:30:04.886471 7951 layer_factory.hpp:74] Creating layer relu3 -I0906 13:30:04.886507 7951 net.cpp:91] Creating Layer relu3 -I0906 13:30:04.886521 7951 net.cpp:411] relu3 <- conv3 -I0906 13:30:04.886548 7951 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:30:04.886565 7951 net.cpp:121] Setting up relu3 -I0906 13:30:04.886575 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) -I0906 13:30:04.886579 7951 net.cpp:134] Memory required for data: 1872588800 -I0906 13:30:04.886584 7951 layer_factory.hpp:74] Creating layer conv4 -I0906 13:30:04.886611 7951 net.cpp:91] Creating Layer conv4 -I0906 13:30:04.886617 7951 net.cpp:411] conv4 <- conv3 -I0906 13:30:04.886633 7951 net.cpp:369] conv4 -> conv4 -I0906 13:30:04.886648 7951 net.cpp:121] Setting up conv4 -I0906 13:30:04.973788 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) -I0906 13:30:04.973810 7951 net.cpp:134] Memory required for data: 1939042304 -I0906 13:30:04.973840 7951 layer_factory.hpp:74] Creating layer relu4 -I0906 13:30:04.973875 7951 net.cpp:91] Creating Layer relu4 -I0906 13:30:04.973891 7951 net.cpp:411] relu4 <- conv4 -I0906 13:30:04.973918 7951 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:30:04.973935 7951 net.cpp:121] Setting up relu4 -I0906 
13:30:04.973945 7951 net.cpp:128] Top shape: 256 384 13 13 (16613376) -I0906 13:30:04.973949 7951 net.cpp:134] Memory required for data: 2005495808 -I0906 13:30:04.973954 7951 layer_factory.hpp:74] Creating layer conv5 -I0906 13:30:04.973980 7951 net.cpp:91] Creating Layer conv5 -I0906 13:30:04.973986 7951 net.cpp:411] conv5 <- conv4 -I0906 13:30:04.974004 7951 net.cpp:369] conv5 -> conv5 -I0906 13:30:04.974019 7951 net.cpp:121] Setting up conv5 -I0906 13:30:05.032649 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) -I0906 13:30:05.032670 7951 net.cpp:134] Memory required for data: 2049798144 -I0906 13:30:05.032712 7951 layer_factory.hpp:74] Creating layer relu5 -I0906 13:30:05.032747 7951 net.cpp:91] Creating Layer relu5 -I0906 13:30:05.032763 7951 net.cpp:411] relu5 <- conv5 -I0906 13:30:05.032788 7951 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:30:05.032805 7951 net.cpp:121] Setting up relu5 -I0906 13:30:05.032814 7951 net.cpp:128] Top shape: 256 256 13 13 (11075584) -I0906 13:30:05.032819 7951 net.cpp:134] Memory required for data: 2094100480 -I0906 13:30:05.032824 7951 layer_factory.hpp:74] Creating layer pool5 -I0906 13:30:05.032843 7951 net.cpp:91] Creating Layer pool5 -I0906 13:30:05.032850 7951 net.cpp:411] pool5 <- conv5 -I0906 13:30:05.032863 7951 net.cpp:369] pool5 -> pool5 -I0906 13:30:05.032877 7951 net.cpp:121] Setting up pool5 -I0906 13:30:05.032897 7951 net.cpp:128] Top shape: 256 256 6 6 (2359296) -I0906 13:30:05.032902 7951 net.cpp:134] Memory required for data: 2103537664 -I0906 13:30:05.032907 7951 layer_factory.hpp:74] Creating layer fc6 -I0906 13:30:05.032945 7951 net.cpp:91] Creating Layer fc6 -I0906 13:30:05.032951 7951 net.cpp:411] fc6 <- pool5 -I0906 13:30:05.032966 7951 net.cpp:369] fc6 -> fc6 -I0906 13:30:05.032980 7951 net.cpp:121] Setting up fc6 -I0906 13:30:05.203193 7955 data_layer.cpp:120] Prefetch batch: 478 ms. -I0906 13:30:05.203241 7955 data_layer.cpp:121] Read time: 65.301 ms. 
-I0906 13:30:05.203250 7955 data_layer.cpp:122] Transform time: 409.394 ms. -I0906 13:30:09.817406 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:09.817432 7951 net.cpp:134] Memory required for data: 2107731968 -I0906 13:30:09.817504 7951 layer_factory.hpp:74] Creating layer relu6 -I0906 13:30:09.817538 7951 net.cpp:91] Creating Layer relu6 -I0906 13:30:09.817553 7951 net.cpp:411] relu6 <- fc6 -I0906 13:30:09.817579 7951 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:30:09.817595 7951 net.cpp:121] Setting up relu6 -I0906 13:30:09.817605 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:09.817608 7951 net.cpp:134] Memory required for data: 2111926272 -I0906 13:30:09.817613 7951 layer_factory.hpp:74] Creating layer drop6 -I0906 13:30:09.817643 7951 net.cpp:91] Creating Layer drop6 -I0906 13:30:09.817649 7951 net.cpp:411] drop6 <- fc6 -I0906 13:30:09.817662 7951 net.cpp:358] drop6 -> fc6 (in-place) -I0906 13:30:09.817672 7951 net.cpp:121] Setting up drop6 -I0906 13:30:09.817692 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:09.817695 7951 net.cpp:134] Memory required for data: 2116120576 -I0906 13:30:09.817700 7951 layer_factory.hpp:74] Creating layer fc7 -I0906 13:30:09.817721 7951 net.cpp:91] Creating Layer fc7 -I0906 13:30:09.817728 7951 net.cpp:411] fc7 <- fc6 -I0906 13:30:09.817744 7951 net.cpp:369] fc7 -> fc7 -I0906 13:30:09.817759 7951 net.cpp:121] Setting up fc7 -I0906 13:30:11.938176 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:11.938201 7951 net.cpp:134] Memory required for data: 2120314880 -I0906 13:30:11.938230 7951 layer_factory.hpp:74] Creating layer relu7 -I0906 13:30:11.938263 7951 net.cpp:91] Creating Layer relu7 -I0906 13:30:11.938278 7951 net.cpp:411] relu7 <- fc7 -I0906 13:30:11.938305 7951 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:30:11.938321 7951 net.cpp:121] Setting up relu7 -I0906 13:30:11.938330 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:11.938334 7951 net.cpp:134] 
Memory required for data: 2124509184 -I0906 13:30:11.938339 7951 layer_factory.hpp:74] Creating layer drop7 -I0906 13:30:11.938355 7951 net.cpp:91] Creating Layer drop7 -I0906 13:30:11.938360 7951 net.cpp:411] drop7 <- fc7 -I0906 13:30:11.938372 7951 net.cpp:358] drop7 -> fc7 (in-place) -I0906 13:30:11.938382 7951 net.cpp:121] Setting up drop7 -I0906 13:30:11.938397 7951 net.cpp:128] Top shape: 256 4096 (1048576) -I0906 13:30:11.938401 7951 net.cpp:134] Memory required for data: 2128703488 -I0906 13:30:11.938406 7951 layer_factory.hpp:74] Creating layer fc8 -I0906 13:30:11.938427 7951 net.cpp:91] Creating Layer fc8 -I0906 13:30:11.938433 7951 net.cpp:411] fc8 <- fc7 -I0906 13:30:11.938449 7951 net.cpp:369] fc8 -> fc8 -I0906 13:30:11.938464 7951 net.cpp:121] Setting up fc8 -I0906 13:30:12.468230 7951 net.cpp:128] Top shape: 256 1000 (256000) -I0906 13:30:12.468251 7951 net.cpp:134] Memory required for data: 2129727488 -I0906 13:30:12.468279 7951 layer_factory.hpp:74] Creating layer loss -I0906 13:30:12.468333 7951 net.cpp:91] Creating Layer loss -I0906 13:30:12.468348 7951 net.cpp:411] loss <- fc8 -I0906 13:30:12.468370 7951 net.cpp:411] loss <- label -I0906 13:30:12.468389 7951 net.cpp:369] loss -> loss -I0906 13:30:12.468408 7951 net.cpp:121] Setting up loss -I0906 13:30:12.468426 7951 layer_factory.hpp:74] Creating layer loss -I0906 13:30:12.469732 7951 net.cpp:128] Top shape: (1) -I0906 13:30:12.469740 7951 net.cpp:130] with loss weight 1 -I0906 13:30:12.469756 7951 net.cpp:134] Memory required for data: 2129727492 -I0906 13:30:12.469769 7951 net.cpp:193] loss needs backward computation. -I0906 13:30:12.469779 7951 net.cpp:193] fc8 needs backward computation. -I0906 13:30:12.469784 7951 net.cpp:193] drop7 needs backward computation. -I0906 13:30:12.469791 7951 net.cpp:193] relu7 needs backward computation. -I0906 13:30:12.469796 7951 net.cpp:193] fc7 needs backward computation. -I0906 13:30:12.469808 7951 net.cpp:193] drop6 needs backward computation. 
-I0906 13:30:12.469815 7951 net.cpp:193] relu6 needs backward computation. -I0906 13:30:12.469820 7951 net.cpp:193] fc6 needs backward computation. -I0906 13:30:12.469825 7951 net.cpp:193] pool5 needs backward computation. -I0906 13:30:12.469830 7951 net.cpp:193] relu5 needs backward computation. -I0906 13:30:12.469835 7951 net.cpp:193] conv5 needs backward computation. -I0906 13:30:12.469882 7951 net.cpp:193] relu4 needs backward computation. -I0906 13:30:12.469887 7951 net.cpp:193] conv4 needs backward computation. -I0906 13:30:12.469893 7951 net.cpp:193] relu3 needs backward computation. -I0906 13:30:12.469899 7951 net.cpp:193] conv3 needs backward computation. -I0906 13:30:12.469907 7951 net.cpp:193] pool2 needs backward computation. -I0906 13:30:12.469913 7951 net.cpp:193] norm2 needs backward computation. -I0906 13:30:12.469918 7951 net.cpp:193] relu2 needs backward computation. -I0906 13:30:12.469924 7951 net.cpp:193] conv2 needs backward computation. -I0906 13:30:12.469930 7951 net.cpp:193] pool1 needs backward computation. -I0906 13:30:12.469936 7951 net.cpp:193] norm1 needs backward computation. -I0906 13:30:12.469943 7951 net.cpp:193] relu1 needs backward computation. -I0906 13:30:12.469949 7951 net.cpp:193] conv1 needs backward computation. -I0906 13:30:12.469955 7951 net.cpp:195] data does not need backward computation. -I0906 13:30:12.469962 7951 net.cpp:236] This network produces output loss -I0906 13:30:12.470002 7951 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:30:12.470018 7951 net.cpp:248] Network initialization done. 
-I0906 13:30:12.470022 7951 net.cpp:249] Memory required for data: 2129727492 -I0906 13:30:12.470949 7951 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val.prototxt -I0906 13:30:12.471081 7951 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data -I0906 13:30:12.471318 7951 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TEST -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - 
lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: 
"fc6" - top: "fc6" -} -layer { - name: "drop6" - type: "Dropout" - bottom: "fc6" - top: "fc6" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "drop7" - type: "Dropout" - bottom: "fc7" - top: "fc7" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "fc8" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:30:12.471688 7951 net.cpp:68] Memory required for data: 0 -I0906 13:30:12.471739 7951 layer_factory.hpp:74] Creating layer data -I0906 13:30:12.471761 7951 net.cpp:91] Creating Layer data -I0906 13:30:12.471772 7951 net.cpp:369] data -> data -I0906 13:30:12.471796 7951 net.cpp:369] data -> label -I0906 13:30:12.471810 7951 net.cpp:121] Setting up data -I0906 13:30:12.471817 7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:30:12.482815 7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb -I0906 13:30:12.483065 7951 data_layer.cpp:53] output data size: 50,3,227,227 -I0906 13:30:12.546061 7951 base_data_layer.cpp:43] Initializing prefetch -I0906 13:30:12.546188 7951 
base_data_layer.cpp:45] Prefetch initialized. -I0906 13:30:12.546222 7951 net.cpp:128] Top shape: 50 3 227 227 (7729350) -I0906 13:30:12.546231 7951 net.cpp:128] Top shape: 50 (50) -I0906 13:30:12.546236 7951 net.cpp:134] Memory required for data: 30917600 -I0906 13:30:12.546268 7951 layer_factory.hpp:74] Creating layer label_data_1_split -I0906 13:30:12.546334 7951 net.cpp:91] Creating Layer label_data_1_split -I0906 13:30:12.546380 7951 net.cpp:411] label_data_1_split <- label -I0906 13:30:12.546419 7951 net.cpp:369] label_data_1_split -> label_data_1_split_0 -I0906 13:30:12.546460 7951 net.cpp:369] label_data_1_split -> label_data_1_split_1 -I0906 13:30:12.546520 7951 net.cpp:121] Setting up label_data_1_split -I0906 13:30:12.546551 7951 net.cpp:128] Top shape: 50 (50) -I0906 13:30:12.546558 7951 net.cpp:128] Top shape: 50 (50) -I0906 13:30:12.546561 7951 net.cpp:134] Memory required for data: 30918000 -I0906 13:30:12.546567 7951 layer_factory.hpp:74] Creating layer conv1 -I0906 13:30:12.546602 7951 net.cpp:91] Creating Layer conv1 -I0906 13:30:12.546608 7951 net.cpp:411] conv1 <- data -I0906 13:30:12.546624 7951 net.cpp:369] conv1 -> conv1 -I0906 13:30:12.546638 7951 net.cpp:121] Setting up conv1 -I0906 13:30:12.551349 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:30:12.551354 7951 net.cpp:134] Memory required for data: 88998000 -I0906 13:30:12.551374 7951 layer_factory.hpp:74] Creating layer relu1 -I0906 13:30:12.551388 7951 net.cpp:91] Creating Layer relu1 -I0906 13:30:12.551393 7951 net.cpp:411] relu1 <- conv1 -I0906 13:30:12.551405 7951 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:30:12.551415 7951 net.cpp:121] Setting up relu1 -I0906 13:30:12.551422 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:30:12.551426 7951 net.cpp:134] Memory required for data: 147078000 -I0906 13:30:12.551431 7951 layer_factory.hpp:74] Creating layer norm1 -I0906 13:30:12.551451 7951 net.cpp:91] Creating Layer norm1 -I0906 13:30:12.551457 7951 
net.cpp:411] norm1 <- conv1 -I0906 13:30:12.551470 7951 net.cpp:369] norm1 -> norm1 -I0906 13:30:12.551481 7951 net.cpp:121] Setting up norm1 -I0906 13:30:12.551499 7951 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:30:12.551504 7951 net.cpp:134] Memory required for data: 205158000 -I0906 13:30:12.551508 7951 layer_factory.hpp:74] Creating layer pool1 -I0906 13:30:12.551524 7951 net.cpp:91] Creating Layer pool1 -I0906 13:30:12.551530 7951 net.cpp:411] pool1 <- norm1 -I0906 13:30:12.551543 7951 net.cpp:369] pool1 -> pool1 -I0906 13:30:12.551553 7951 net.cpp:121] Setting up pool1 -I0906 13:30:12.551571 7951 net.cpp:128] Top shape: 50 96 27 27 (3499200) -I0906 13:30:12.551576 7951 net.cpp:134] Memory required for data: 219154800 -I0906 13:30:12.551580 7951 layer_factory.hpp:74] Creating layer conv2 -I0906 13:30:12.551594 7951 net.cpp:91] Creating Layer conv2 -I0906 13:30:12.551600 7951 net.cpp:411] conv2 <- pool1 -I0906 13:30:12.551615 7951 net.cpp:369] conv2 -> conv2 -I0906 13:30:12.551627 7951 net.cpp:121] Setting up conv2 -I0906 13:30:12.591382 7951 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:30:12.591404 7951 net.cpp:134] Memory required for data: 256479600 -I0906 13:30:12.591442 7951 layer_factory.hpp:74] Creating layer relu2 -I0906 13:30:12.591473 7951 net.cpp:91] Creating Layer relu2 -I0906 13:30:12.591486 7951 net.cpp:411] relu2 <- conv2 -I0906 13:30:12.591511 7951 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:30:12.591526 7951 net.cpp:121] Setting up relu2 -I0906 13:30:12.591536 7951 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:30:12.591539 7951 net.cpp:134] Memory required for data: 293804400 -I0906 13:30:12.591544 7951 layer_factory.hpp:74] Creating layer norm2 -I0906 13:30:12.591572 7951 net.cpp:91] Creating Layer norm2 -I0906 13:30:12.591578 7951 net.cpp:411] norm2 <- conv2 -I0906 13:30:12.591591 7951 net.cpp:369] norm2 -> norm2 -I0906 13:30:12.591609 7951 net.cpp:121] Setting up norm2 -I0906 13:30:12.591629 7951 
net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:30:12.591634 7951 net.cpp:134] Memory required for data: 331129200 -I0906 13:30:12.591639 7951 layer_factory.hpp:74] Creating layer pool2 -I0906 13:30:12.591657 7951 net.cpp:91] Creating Layer pool2 -I0906 13:30:12.591663 7951 net.cpp:411] pool2 <- norm2 -I0906 13:30:12.591676 7951 net.cpp:369] pool2 -> pool2 -I0906 13:30:12.591687 7951 net.cpp:121] Setting up pool2 -I0906 13:30:12.591706 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:30:12.591709 7951 net.cpp:134] Memory required for data: 339782000 -I0906 13:30:12.591714 7951 layer_factory.hpp:74] Creating layer conv3 -I0906 13:30:12.591739 7951 net.cpp:91] Creating Layer conv3 -I0906 13:30:12.591744 7951 net.cpp:411] conv3 <- pool2 -I0906 13:30:12.591802 7951 net.cpp:369] conv3 -> conv3 -I0906 13:30:12.591814 7951 net.cpp:121] Setting up conv3 -I0906 13:30:12.640625 7956 data_layer.cpp:120] Prefetch batch: 94 ms. -I0906 13:30:12.640658 7956 data_layer.cpp:121] Read time: 12.07 ms. -I0906 13:30:12.640666 7956 data_layer.cpp:122] Transform time: 81.163 ms. 
-I0906 13:30:12.705313 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:30:12.705337 7951 net.cpp:134] Memory required for data: 352761200 -I0906 13:30:12.705377 7951 layer_factory.hpp:74] Creating layer relu3 -I0906 13:30:12.705410 7951 net.cpp:91] Creating Layer relu3 -I0906 13:30:12.705425 7951 net.cpp:411] relu3 <- conv3 -I0906 13:30:12.705451 7951 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:30:12.705466 7951 net.cpp:121] Setting up relu3 -I0906 13:30:12.705476 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:30:12.705479 7951 net.cpp:134] Memory required for data: 365740400 -I0906 13:30:12.705484 7951 layer_factory.hpp:74] Creating layer conv4 -I0906 13:30:12.705512 7951 net.cpp:91] Creating Layer conv4 -I0906 13:30:12.705518 7951 net.cpp:411] conv4 <- conv3 -I0906 13:30:12.705534 7951 net.cpp:369] conv4 -> conv4 -I0906 13:30:12.705549 7951 net.cpp:121] Setting up conv4 -I0906 13:30:12.789549 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:30:12.789571 7951 net.cpp:134] Memory required for data: 378719600 -I0906 13:30:12.789597 7951 layer_factory.hpp:74] Creating layer relu4 -I0906 13:30:12.789631 7951 net.cpp:91] Creating Layer relu4 -I0906 13:30:12.789646 7951 net.cpp:411] relu4 <- conv4 -I0906 13:30:12.789674 7951 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:30:12.789690 7951 net.cpp:121] Setting up relu4 -I0906 13:30:12.789698 7951 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:30:12.789701 7951 net.cpp:134] Memory required for data: 391698800 -I0906 13:30:12.789706 7951 layer_factory.hpp:74] Creating layer conv5 -I0906 13:30:12.789732 7951 net.cpp:91] Creating Layer conv5 -I0906 13:30:12.789738 7951 net.cpp:411] conv5 <- conv4 -I0906 13:30:12.789754 7951 net.cpp:369] conv5 -> conv5 -I0906 13:30:12.789770 7951 net.cpp:121] Setting up conv5 -I0906 13:30:12.846217 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:30:12.846233 7951 net.cpp:134] Memory required for data: 400351600 -I0906 
13:30:12.846271 7951 layer_factory.hpp:74] Creating layer relu5 -I0906 13:30:12.846298 7951 net.cpp:91] Creating Layer relu5 -I0906 13:30:12.846312 7951 net.cpp:411] relu5 <- conv5 -I0906 13:30:12.846335 7951 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:30:12.846350 7951 net.cpp:121] Setting up relu5 -I0906 13:30:12.846359 7951 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:30:12.846362 7951 net.cpp:134] Memory required for data: 409004400 -I0906 13:30:12.846367 7951 layer_factory.hpp:74] Creating layer pool5 -I0906 13:30:12.846397 7951 net.cpp:91] Creating Layer pool5 -I0906 13:30:12.846402 7951 net.cpp:411] pool5 <- conv5 -I0906 13:30:12.846417 7951 net.cpp:369] pool5 -> pool5 -I0906 13:30:12.846431 7951 net.cpp:121] Setting up pool5 -I0906 13:30:12.846451 7951 net.cpp:128] Top shape: 50 256 6 6 (460800) -I0906 13:30:12.846454 7951 net.cpp:134] Memory required for data: 410847600 -I0906 13:30:12.846459 7951 layer_factory.hpp:74] Creating layer fc6 -I0906 13:30:12.846479 7951 net.cpp:91] Creating Layer fc6 -I0906 13:30:12.846485 7951 net.cpp:411] fc6 <- pool5 -I0906 13:30:12.846499 7951 net.cpp:369] fc6 -> fc6 -I0906 13:30:12.846513 7951 net.cpp:121] Setting up fc6 -I0906 13:30:17.661206 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:17.661231 7951 net.cpp:134] Memory required for data: 411666800 -I0906 13:30:17.661259 7951 layer_factory.hpp:74] Creating layer relu6 -I0906 13:30:17.661293 7951 net.cpp:91] Creating Layer relu6 -I0906 13:30:17.661309 7951 net.cpp:411] relu6 <- fc6 -I0906 13:30:17.661334 7951 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:30:17.661350 7951 net.cpp:121] Setting up relu6 -I0906 13:30:17.661360 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:17.661363 7951 net.cpp:134] Memory required for data: 412486000 -I0906 13:30:17.661412 7951 layer_factory.hpp:74] Creating layer drop6 -I0906 13:30:17.661428 7951 net.cpp:91] Creating Layer drop6 -I0906 13:30:17.661434 7951 net.cpp:411] drop6 <- fc6 -I0906 
13:30:17.661447 7951 net.cpp:358] drop6 -> fc6 (in-place) -I0906 13:30:17.661456 7951 net.cpp:121] Setting up drop6 -I0906 13:30:17.661470 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:17.661475 7951 net.cpp:134] Memory required for data: 413305200 -I0906 13:30:17.661480 7951 layer_factory.hpp:74] Creating layer fc7 -I0906 13:30:17.661501 7951 net.cpp:91] Creating Layer fc7 -I0906 13:30:17.661507 7951 net.cpp:411] fc7 <- fc6 -I0906 13:30:17.661523 7951 net.cpp:369] fc7 -> fc7 -I0906 13:30:17.661540 7951 net.cpp:121] Setting up fc7 -I0906 13:30:19.790464 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:19.790488 7951 net.cpp:134] Memory required for data: 414124400 -I0906 13:30:19.790514 7951 layer_factory.hpp:74] Creating layer relu7 -I0906 13:30:19.790547 7951 net.cpp:91] Creating Layer relu7 -I0906 13:30:19.790563 7951 net.cpp:411] relu7 <- fc7 -I0906 13:30:19.790591 7951 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:30:19.790607 7951 net.cpp:121] Setting up relu7 -I0906 13:30:19.790616 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:19.790621 7951 net.cpp:134] Memory required for data: 414943600 -I0906 13:30:19.790624 7951 layer_factory.hpp:74] Creating layer drop7 -I0906 13:30:19.790639 7951 net.cpp:91] Creating Layer drop7 -I0906 13:30:19.790645 7951 net.cpp:411] drop7 <- fc7 -I0906 13:30:19.790657 7951 net.cpp:358] drop7 -> fc7 (in-place) -I0906 13:30:19.790668 7951 net.cpp:121] Setting up drop7 -I0906 13:30:19.790683 7951 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:30:19.790688 7951 net.cpp:134] Memory required for data: 415762800 -I0906 13:30:19.790691 7951 layer_factory.hpp:74] Creating layer fc8 -I0906 13:30:19.790714 7951 net.cpp:91] Creating Layer fc8 -I0906 13:30:19.790719 7951 net.cpp:411] fc8 <- fc7 -I0906 13:30:19.790735 7951 net.cpp:369] fc8 -> fc8 -I0906 13:30:19.790760 7951 net.cpp:121] Setting up fc8 -I0906 13:30:20.310474 7951 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:30:20.310497 7951 
net.cpp:134] Memory required for data: 415962800 -I0906 13:30:20.310523 7951 layer_factory.hpp:74] Creating layer fc8_fc8_0_split -I0906 13:30:20.310555 7951 net.cpp:91] Creating Layer fc8_fc8_0_split -I0906 13:30:20.310570 7951 net.cpp:411] fc8_fc8_0_split <- fc8 -I0906 13:30:20.310598 7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 -I0906 13:30:20.310621 7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 -I0906 13:30:20.310633 7951 net.cpp:121] Setting up fc8_fc8_0_split -I0906 13:30:20.310650 7951 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:30:20.310657 7951 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:30:20.310660 7951 net.cpp:134] Memory required for data: 416362800 -I0906 13:30:20.310665 7951 layer_factory.hpp:74] Creating layer accuracy -I0906 13:30:20.310698 7951 net.cpp:91] Creating Layer accuracy -I0906 13:30:20.310704 7951 net.cpp:411] accuracy <- fc8_fc8_0_split_0 -I0906 13:30:20.310715 7951 net.cpp:411] accuracy <- label_data_1_split_0 -I0906 13:30:20.310729 7951 net.cpp:369] accuracy -> accuracy -I0906 13:30:20.310740 7951 net.cpp:121] Setting up accuracy -I0906 13:30:20.310756 7951 net.cpp:128] Top shape: (1) -I0906 13:30:20.310760 7951 net.cpp:134] Memory required for data: 416362804 -I0906 13:30:20.310765 7951 layer_factory.hpp:74] Creating layer loss -I0906 13:30:20.310777 7951 net.cpp:91] Creating Layer loss -I0906 13:30:20.310782 7951 net.cpp:411] loss <- fc8_fc8_0_split_1 -I0906 13:30:20.310793 7951 net.cpp:411] loss <- label_data_1_split_1 -I0906 13:30:20.310804 7951 net.cpp:369] loss -> loss -I0906 13:30:20.310816 7951 net.cpp:121] Setting up loss -I0906 13:30:20.310825 7951 layer_factory.hpp:74] Creating layer loss -I0906 13:30:20.311178 7951 net.cpp:128] Top shape: (1) -I0906 13:30:20.311183 7951 net.cpp:130] with loss weight 1 -I0906 13:30:20.311200 7951 net.cpp:134] Memory required for data: 416362808 -I0906 13:30:20.311250 7951 net.cpp:193] loss needs backward computation. 
-I0906 13:30:20.311259 7951 net.cpp:195] accuracy does not need backward computation. -I0906 13:30:20.311265 7951 net.cpp:193] fc8_fc8_0_split needs backward computation. -I0906 13:30:20.311271 7951 net.cpp:193] fc8 needs backward computation. -I0906 13:30:20.311277 7951 net.cpp:193] drop7 needs backward computation. -I0906 13:30:20.311282 7951 net.cpp:193] relu7 needs backward computation. -I0906 13:30:20.311288 7951 net.cpp:193] fc7 needs backward computation. -I0906 13:30:20.311295 7951 net.cpp:193] drop6 needs backward computation. -I0906 13:30:20.311300 7951 net.cpp:193] relu6 needs backward computation. -I0906 13:30:20.311305 7951 net.cpp:193] fc6 needs backward computation. -I0906 13:30:20.311311 7951 net.cpp:193] pool5 needs backward computation. -I0906 13:30:20.311317 7951 net.cpp:193] relu5 needs backward computation. -I0906 13:30:20.311322 7951 net.cpp:193] conv5 needs backward computation. -I0906 13:30:20.311328 7951 net.cpp:193] relu4 needs backward computation. -I0906 13:30:20.311333 7951 net.cpp:193] conv4 needs backward computation. -I0906 13:30:20.311339 7951 net.cpp:193] relu3 needs backward computation. -I0906 13:30:20.311345 7951 net.cpp:193] conv3 needs backward computation. -I0906 13:30:20.311352 7951 net.cpp:193] pool2 needs backward computation. -I0906 13:30:20.311357 7951 net.cpp:193] norm2 needs backward computation. -I0906 13:30:20.311363 7951 net.cpp:193] relu2 needs backward computation. -I0906 13:30:20.311368 7951 net.cpp:193] conv2 needs backward computation. -I0906 13:30:20.311374 7951 net.cpp:193] pool1 needs backward computation. -I0906 13:30:20.311380 7951 net.cpp:193] norm1 needs backward computation. -I0906 13:30:20.311386 7951 net.cpp:193] relu1 needs backward computation. -I0906 13:30:20.311391 7951 net.cpp:193] conv1 needs backward computation. -I0906 13:30:20.311399 7951 net.cpp:195] label_data_1_split does not need backward computation. -I0906 13:30:20.311406 7951 net.cpp:195] data does not need backward computation. 
-I0906 13:30:20.311411 7951 net.cpp:236] This network produces output accuracy -I0906 13:30:20.311419 7951 net.cpp:236] This network produces output loss -I0906 13:30:20.311455 7951 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:30:20.311468 7951 net.cpp:248] Network initialization done. -I0906 13:30:20.311472 7951 net.cpp:249] Memory required for data: 416362808 -I0906 13:30:20.311663 7951 solver.cpp:53] Solver scaffolding done. -I0906 13:30:20.311787 7951 solver.cpp:270] Solving AlexNet -I0906 13:30:20.311791 7951 solver.cpp:271] Learning Rate Policy: step -I0906 13:30:20.313592 7951 solver.cpp:314] Iteration 0, Testing net (#0) -I0906 13:30:20.313630 7951 net.cpp:696] Copying source layer data -I0906 13:30:20.313635 7951 net.cpp:696] Copying source layer conv1 -I0906 13:30:20.316704 7951 net.cpp:696] Copying source layer relu1 -I0906 13:30:20.316743 7951 net.cpp:696] Copying source layer norm1 -I0906 13:30:20.316756 7951 net.cpp:696] Copying source layer pool1 -I0906 13:30:20.316766 7951 net.cpp:696] Copying source layer conv2 -I0906 13:30:20.317158 7951 net.cpp:696] Copying source layer relu2 -I0906 13:30:20.317173 7951 net.cpp:696] Copying source layer norm2 -I0906 13:30:20.317183 7951 net.cpp:696] Copying source layer pool2 -I0906 13:30:20.317193 7951 net.cpp:696] Copying source layer conv3 -I0906 13:30:20.317970 7951 net.cpp:696] Copying source layer relu3 -I0906 13:30:20.317983 7951 net.cpp:696] Copying source layer conv4 -I0906 13:30:20.318357 7951 net.cpp:696] Copying source layer relu4 -I0906 13:30:20.318372 7951 net.cpp:696] Copying source layer conv5 -I0906 13:30:20.318827 7951 net.cpp:696] Copying source layer relu5 -I0906 13:30:20.318840 7951 net.cpp:696] Copying source layer pool5 -I0906 13:30:20.318850 7951 net.cpp:696] Copying source layer fc6 -I0906 13:30:20.336436 7951 net.cpp:696] Copying source layer relu6 -I0906 13:30:20.336460 7951 net.cpp:696] Copying source layer drop6 -I0906 13:30:20.336467 7951 net.cpp:696] Copying sou 
\ No newline at end of file diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 deleted file mode 100644 index b99da3d4..00000000 --- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 +++ /dev/null @@ -1,1208 +0,0 @@ -Log file created at: 2015/09/06 13:33:58 -Running on machine: AMD-RESEARCH -Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg -I0906 13:33:58.858449 8300 caffe.cpp:114] Use GPU with device ID 0 -I0906 13:33:58.896994 8300 device.cpp:230] Number of platforms found:1 -I0906 13:33:58.897037 8300 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing -I0906 13:33:58.897054 8300 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE -I0906 13:33:58.897061 8300 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) -I0906 13:33:58.897068 8300 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. -I0906 13:33:58.897075 8300 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices -I0906 13:33:58.897086 8300 device.cpp:286] Number of devices found:1 -I0906 13:33:58.897092 8300 device.cpp:288] DeviceID: 0x163a250 -I0906 13:33:58.897126 8300 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU -I0906 13:33:58.897137 8300 device.cpp:393] Is it integrated GPU?: 0 -I0906 13:33:58.897145 8300 device.cpp:393] Max clock frequency MHz: 930 -I0906 13:33:58.897151 8300 device.cpp:393] Host-Device unified mem: 0 -I0906 13:33:58.897157 8300 device.cpp:393] ECC support: 0 -I0906 13:33:58.897164 8300 device.cpp:393] Endian little: 1 -I0906 13:33:58.897171 8300 device.cpp:393] Max compute units: 44 -I0906 13:33:58.897177 8300 device.cpp:393] Max work group size: 256 -I0906 13:33:58.897186 8300 device.cpp:393] Max work item dimensions: 3 -I0906 13:33:58.897192 8300 device.cpp:393] Max work item sizes: 0x100 -I0906 13:33:58.897202 8300 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: 
CL_QUEUE_PROFILING_ENABLE -I0906 13:33:58.897209 8300 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL -I0906 13:33:58.897215 8300 device.cpp:393] Max mem alloc size: 4244635648 -I0906 13:33:58.897222 8300 device.cpp:393] Global mem size: 16878927872 -I0906 13:33:58.897228 8300 device.cpp:393] Local mem size: 32768 -I0906 13:33:58.897241 8300 device.cpp:96] Picked device type : GPU 0 -I0906 13:34:01.301823 8300 device.cpp:152] Build Program -I0906 13:34:01.302049 8300 caffe.cpp:122] Starting Optimization -I0906 13:34:01.302139 8300 solver.cpp:40] Initializing solver from parameters: -test_iter: 1 -test_interval: 1000 -base_lr: 0.01 -display: 1 -max_iter: 10 -lr_policy: "step" -gamma: 0.1 -momentum: 0.9 -weight_decay: 0.0005 -stepsize: 100000 -snapshot: 10000 -snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" -solver_mode: GPU -net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" -I0906 13:34:01.302249 8300 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:34:01.303269 8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data -I0906 13:34:01.303316 8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy -I0906 13:34:01.303493 8300 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TRAIN -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 
11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: 
"ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:34:01.303913 8300 net.cpp:68] Memory required for data: 0 -I0906 13:34:01.304132 8300 layer_factory.hpp:74] Creating layer data -I0906 13:34:01.304185 8300 net.cpp:91] Creating 
Layer data -I0906 13:34:01.304205 8300 net.cpp:369] data -> data -I0906 13:34:01.304306 8300 net.cpp:369] data -> label -I0906 13:34:01.304328 8300 net.cpp:121] Setting up data -I0906 13:34:01.304342 8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:34:01.318087 8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb -I0906 13:34:01.318596 8300 data_layer.cpp:53] output data size: 100,3,227,227 -I0906 13:34:01.351816 8300 base_data_layer.cpp:43] Initializing prefetch -I0906 13:34:01.352555 8300 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:34:01.352643 8300 net.cpp:128] Top shape: 100 3 227 227 (15458700) -I0906 13:34:01.352655 8300 net.cpp:128] Top shape: 100 (100) -I0906 13:34:01.352660 8300 net.cpp:134] Memory required for data: 61835200 -I0906 13:34:01.352697 8300 layer_factory.hpp:74] Creating layer conv1 -I0906 13:34:01.352783 8300 net.cpp:91] Creating Layer conv1 -I0906 13:34:01.352808 8300 net.cpp:411] conv1 <- data -I0906 13:34:01.352902 8300 net.cpp:369] conv1 -> conv1 -I0906 13:34:01.352937 8300 net.cpp:121] Setting up conv1 -I0906 13:34:01.357744 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:01.357751 8300 net.cpp:134] Memory required for data: 177995200 -I0906 13:34:01.357791 8300 layer_factory.hpp:74] Creating layer relu1 -I0906 13:34:01.357815 8300 net.cpp:91] Creating Layer relu1 -I0906 13:34:01.357820 8300 net.cpp:411] relu1 <- conv1 -I0906 13:34:01.357833 8300 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:34:01.357843 8300 net.cpp:121] Setting up relu1 -I0906 13:34:01.357851 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:01.357856 8300 net.cpp:134] Memory required for data: 294155200 -I0906 13:34:01.357861 8300 layer_factory.hpp:74] Creating layer norm1 -I0906 13:34:01.357890 8300 net.cpp:91] Creating Layer norm1 -I0906 13:34:01.357895 8300 net.cpp:411] norm1 <- conv1 -I0906 13:34:01.357908 8300 
net.cpp:369] norm1 -> norm1 -I0906 13:34:01.357920 8300 net.cpp:121] Setting up norm1 -I0906 13:34:01.357944 8300 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:01.357949 8300 net.cpp:134] Memory required for data: 410315200 -I0906 13:34:01.357954 8300 layer_factory.hpp:74] Creating layer pool1 -I0906 13:34:01.357978 8300 net.cpp:91] Creating Layer pool1 -I0906 13:34:01.357985 8300 net.cpp:411] pool1 <- norm1 -I0906 13:34:01.357996 8300 net.cpp:369] pool1 -> pool1 -I0906 13:34:01.358010 8300 net.cpp:121] Setting up pool1 -I0906 13:34:01.358038 8300 net.cpp:128] Top shape: 100 96 27 27 (6998400) -I0906 13:34:01.358042 8300 net.cpp:134] Memory required for data: 438308800 -I0906 13:34:01.358047 8300 layer_factory.hpp:74] Creating layer conv2 -I0906 13:34:01.358060 8300 net.cpp:91] Creating Layer conv2 -I0906 13:34:01.358067 8300 net.cpp:411] conv2 <- pool1 -I0906 13:34:01.358079 8300 net.cpp:369] conv2 -> conv2 -I0906 13:34:01.358091 8300 net.cpp:121] Setting up conv2 -I0906 13:34:01.397493 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:34:01.397511 8300 net.cpp:134] Memory required for data: 512958400 -I0906 13:34:01.397541 8300 layer_factory.hpp:74] Creating layer relu2 -I0906 13:34:01.397567 8300 net.cpp:91] Creating Layer relu2 -I0906 13:34:01.397578 8300 net.cpp:411] relu2 <- conv2 -I0906 13:34:01.397599 8300 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:34:01.397613 8300 net.cpp:121] Setting up relu2 -I0906 13:34:01.397621 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:34:01.397626 8300 net.cpp:134] Memory required for data: 587608000 -I0906 13:34:01.397631 8300 layer_factory.hpp:74] Creating layer norm2 -I0906 13:34:01.397649 8300 net.cpp:91] Creating Layer norm2 -I0906 13:34:01.397655 8300 net.cpp:411] norm2 <- conv2 -I0906 13:34:01.397667 8300 net.cpp:369] norm2 -> norm2 -I0906 13:34:01.397680 8300 net.cpp:121] Setting up norm2 -I0906 13:34:01.397699 8300 net.cpp:128] Top shape: 100 256 27 27 (18662400) 
-I0906 13:34:01.397704 8300 net.cpp:134] Memory required for data: 662257600 -I0906 13:34:01.397709 8300 layer_factory.hpp:74] Creating layer pool2 -I0906 13:34:01.397729 8300 net.cpp:91] Creating Layer pool2 -I0906 13:34:01.397735 8300 net.cpp:411] pool2 <- norm2 -I0906 13:34:01.397748 8300 net.cpp:369] pool2 -> pool2 -I0906 13:34:01.397758 8300 net.cpp:121] Setting up pool2 -I0906 13:34:01.397776 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:01.397780 8300 net.cpp:134] Memory required for data: 679563200 -I0906 13:34:01.397830 8300 layer_factory.hpp:74] Creating layer conv3 -I0906 13:34:01.397851 8300 net.cpp:91] Creating Layer conv3 -I0906 13:34:01.397857 8300 net.cpp:411] conv3 <- pool2 -I0906 13:34:01.397871 8300 net.cpp:369] conv3 -> conv3 -I0906 13:34:01.397886 8300 net.cpp:121] Setting up conv3 -I0906 13:34:01.513005 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:01.513030 8300 net.cpp:134] Memory required for data: 705521600 -I0906 13:34:01.513072 8300 layer_factory.hpp:74] Creating layer relu3 -I0906 13:34:01.513104 8300 net.cpp:91] Creating Layer relu3 -I0906 13:34:01.513120 8300 net.cpp:411] relu3 <- conv3 -I0906 13:34:01.513149 8300 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:34:01.513164 8300 net.cpp:121] Setting up relu3 -I0906 13:34:01.513173 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:01.513177 8300 net.cpp:134] Memory required for data: 731480000 -I0906 13:34:01.513182 8300 layer_factory.hpp:74] Creating layer conv4 -I0906 13:34:01.513208 8300 net.cpp:91] Creating Layer conv4 -I0906 13:34:01.513214 8300 net.cpp:411] conv4 <- conv3 -I0906 13:34:01.513229 8300 net.cpp:369] conv4 -> conv4 -I0906 13:34:01.513244 8300 net.cpp:121] Setting up conv4 -I0906 13:34:01.539248 8304 data_layer.cpp:120] Prefetch batch: 186 ms. -I0906 13:34:01.539295 8304 data_layer.cpp:121] Read time: 22.695 ms. -I0906 13:34:01.539304 8304 data_layer.cpp:122] Transform time: 161.707 ms. 
-I0906 13:34:01.598980 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:01.599004 8300 net.cpp:134] Memory required for data: 757438400 -I0906 13:34:01.599028 8300 layer_factory.hpp:74] Creating layer relu4 -I0906 13:34:01.599059 8300 net.cpp:91] Creating Layer relu4 -I0906 13:34:01.599074 8300 net.cpp:411] relu4 <- conv4 -I0906 13:34:01.599100 8300 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:34:01.599117 8300 net.cpp:121] Setting up relu4 -I0906 13:34:01.599125 8300 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:01.599129 8300 net.cpp:134] Memory required for data: 783396800 -I0906 13:34:01.599134 8300 layer_factory.hpp:74] Creating layer conv5 -I0906 13:34:01.599158 8300 net.cpp:91] Creating Layer conv5 -I0906 13:34:01.599164 8300 net.cpp:411] conv5 <- conv4 -I0906 13:34:01.599177 8300 net.cpp:369] conv5 -> conv5 -I0906 13:34:01.599191 8300 net.cpp:121] Setting up conv5 -I0906 13:34:01.658185 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:01.658205 8300 net.cpp:134] Memory required for data: 800702400 -I0906 13:34:01.658242 8300 layer_factory.hpp:74] Creating layer relu5 -I0906 13:34:01.658269 8300 net.cpp:91] Creating Layer relu5 -I0906 13:34:01.658283 8300 net.cpp:411] relu5 <- conv5 -I0906 13:34:01.658308 8300 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:34:01.658321 8300 net.cpp:121] Setting up relu5 -I0906 13:34:01.658330 8300 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:01.658334 8300 net.cpp:134] Memory required for data: 818008000 -I0906 13:34:01.658339 8300 layer_factory.hpp:74] Creating layer pool5 -I0906 13:34:01.658357 8300 net.cpp:91] Creating Layer pool5 -I0906 13:34:01.658362 8300 net.cpp:411] pool5 <- conv5 -I0906 13:34:01.658375 8300 net.cpp:369] pool5 -> pool5 -I0906 13:34:01.658390 8300 net.cpp:121] Setting up pool5 -I0906 13:34:01.658407 8300 net.cpp:128] Top shape: 100 256 6 6 (921600) -I0906 13:34:01.658412 8300 net.cpp:134] Memory required for data: 821694400 -I0906 
13:34:01.658416 8300 layer_factory.hpp:74] Creating layer fc6 -I0906 13:34:01.658447 8300 net.cpp:91] Creating Layer fc6 -I0906 13:34:01.658453 8300 net.cpp:411] fc6 <- pool5 -I0906 13:34:01.658466 8300 net.cpp:369] fc6 -> fc6 -I0906 13:34:01.658480 8300 net.cpp:121] Setting up fc6 -I0906 13:34:06.571331 8300 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:06.571354 8300 net.cpp:134] Memory required for data: 823332800 -I0906 13:34:06.571382 8300 layer_factory.hpp:74] Creating layer relu6 -I0906 13:34:06.571415 8300 net.cpp:91] Creating Layer relu6 -I0906 13:34:06.571430 8300 net.cpp:411] relu6 <- fc6 -I0906 13:34:06.571456 8300 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:34:06.571521 8300 net.cpp:121] Setting up relu6 -I0906 13:34:06.571529 8300 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:06.571533 8300 net.cpp:134] Memory required for data: 824971200 -I0906 13:34:06.571538 8300 layer_factory.hpp:74] Creating layer fc7 -I0906 13:34:06.571558 8300 net.cpp:91] Creating Layer fc7 -I0906 13:34:06.571563 8300 net.cpp:411] fc7 <- fc6 -I0906 13:34:06.571578 8300 net.cpp:369] fc7 -> fc7 -I0906 13:34:06.571593 8300 net.cpp:121] Setting up fc7 -I0906 13:34:08.751106 8300 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:08.751129 8300 net.cpp:134] Memory required for data: 826609600 -I0906 13:34:08.751155 8300 layer_factory.hpp:74] Creating layer relu7 -I0906 13:34:08.751186 8300 net.cpp:91] Creating Layer relu7 -I0906 13:34:08.751202 8300 net.cpp:411] relu7 <- fc7 -I0906 13:34:08.751229 8300 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:34:08.751243 8300 net.cpp:121] Setting up relu7 -I0906 13:34:08.751251 8300 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:08.751255 8300 net.cpp:134] Memory required for data: 828248000 -I0906 13:34:08.751260 8300 layer_factory.hpp:74] Creating layer fc8 -I0906 13:34:08.751281 8300 net.cpp:91] Creating Layer fc8 -I0906 13:34:08.751286 8300 net.cpp:411] fc8 <- fc7 -I0906 13:34:08.751301 8300 net.cpp:369] 
fc8 -> fc8 -I0906 13:34:08.751315 8300 net.cpp:121] Setting up fc8 -I0906 13:34:09.287158 8300 net.cpp:128] Top shape: 100 1000 (100000) -I0906 13:34:09.287181 8300 net.cpp:134] Memory required for data: 828648000 -I0906 13:34:09.287209 8300 layer_factory.hpp:74] Creating layer loss -I0906 13:34:09.287257 8300 net.cpp:91] Creating Layer loss -I0906 13:34:09.287272 8300 net.cpp:411] loss <- fc8 -I0906 13:34:09.287295 8300 net.cpp:411] loss <- label -I0906 13:34:09.287313 8300 net.cpp:369] loss -> loss -I0906 13:34:09.287333 8300 net.cpp:121] Setting up loss -I0906 13:34:09.287349 8300 layer_factory.hpp:74] Creating layer loss -I0906 13:34:09.287860 8300 net.cpp:128] Top shape: (1) -I0906 13:34:09.287865 8300 net.cpp:130] with loss weight 1 -I0906 13:34:09.287881 8300 net.cpp:134] Memory required for data: 828648004 -I0906 13:34:09.287890 8300 net.cpp:193] loss needs backward computation. -I0906 13:34:09.287899 8300 net.cpp:193] fc8 needs backward computation. -I0906 13:34:09.287904 8300 net.cpp:193] relu7 needs backward computation. -I0906 13:34:09.287910 8300 net.cpp:193] fc7 needs backward computation. -I0906 13:34:09.287916 8300 net.cpp:193] relu6 needs backward computation. -I0906 13:34:09.287921 8300 net.cpp:193] fc6 needs backward computation. -I0906 13:34:09.287935 8300 net.cpp:193] pool5 needs backward computation. -I0906 13:34:09.287940 8300 net.cpp:193] relu5 needs backward computation. -I0906 13:34:09.287946 8300 net.cpp:193] conv5 needs backward computation. -I0906 13:34:09.287952 8300 net.cpp:193] relu4 needs backward computation. -I0906 13:34:09.287958 8300 net.cpp:193] conv4 needs backward computation. -I0906 13:34:09.287964 8300 net.cpp:193] relu3 needs backward computation. -I0906 13:34:09.287969 8300 net.cpp:193] conv3 needs backward computation. -I0906 13:34:09.287977 8300 net.cpp:193] pool2 needs backward computation. -I0906 13:34:09.287983 8300 net.cpp:193] norm2 needs backward computation. 
-I0906 13:34:09.287989 8300 net.cpp:193] relu2 needs backward computation. -I0906 13:34:09.287996 8300 net.cpp:193] conv2 needs backward computation. -I0906 13:34:09.288002 8300 net.cpp:193] pool1 needs backward computation. -I0906 13:34:09.288007 8300 net.cpp:193] norm1 needs backward computation. -I0906 13:34:09.288014 8300 net.cpp:193] relu1 needs backward computation. -I0906 13:34:09.288019 8300 net.cpp:193] conv1 needs backward computation. -I0906 13:34:09.288028 8300 net.cpp:195] data does not need backward computation. -I0906 13:34:09.288034 8300 net.cpp:236] This network produces output loss -I0906 13:34:09.288067 8300 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:34:09.288084 8300 net.cpp:248] Network initialization done. -I0906 13:34:09.288087 8300 net.cpp:249] Memory required for data: 828648004 -I0906 13:34:09.289022 8300 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:34:09.289130 8300 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data -I0906 13:34:09.289348 8300 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TEST -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - 
name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - 
convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "fc8" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:34:09.289656 8300 net.cpp:68] Memory required for data: 0 -I0906 13:34:09.289702 8300 layer_factory.hpp:74] Creating layer data -I0906 13:34:09.289721 8300 net.cpp:91] Creating Layer data -I0906 13:34:09.289731 8300 net.cpp:369] data -> data -I0906 
13:34:09.289752 8300 net.cpp:369] data -> label -I0906 13:34:09.289764 8300 net.cpp:121] Setting up data -I0906 13:34:09.289772 8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:34:09.298058 8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb -I0906 13:34:09.298318 8300 data_layer.cpp:53] output data size: 50,3,227,227 -I0906 13:34:09.314699 8300 base_data_layer.cpp:43] Initializing prefetch -I0906 13:34:09.314806 8300 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:34:09.314834 8300 net.cpp:128] Top shape: 50 3 227 227 (7729350) -I0906 13:34:09.314843 8300 net.cpp:128] Top shape: 50 (50) -I0906 13:34:09.314848 8300 net.cpp:134] Memory required for data: 30917600 -I0906 13:34:09.314882 8300 layer_factory.hpp:74] Creating layer label_data_1_split -I0906 13:34:09.314973 8300 net.cpp:91] Creating Layer label_data_1_split -I0906 13:34:09.314997 8300 net.cpp:411] label_data_1_split <- label -I0906 13:34:09.315035 8300 net.cpp:369] label_data_1_split -> label_data_1_split_0 -I0906 13:34:09.315073 8300 net.cpp:369] label_data_1_split -> label_data_1_split_1 -I0906 13:34:09.315085 8300 net.cpp:121] Setting up label_data_1_split -I0906 13:34:09.315116 8300 net.cpp:128] Top shape: 50 (50) -I0906 13:34:09.315124 8300 net.cpp:128] Top shape: 50 (50) -I0906 13:34:09.315127 8300 net.cpp:134] Memory required for data: 30918000 -I0906 13:34:09.315131 8300 layer_factory.hpp:74] Creating layer conv1 -I0906 13:34:09.315165 8300 net.cpp:91] Creating Layer conv1 -I0906 13:34:09.315171 8300 net.cpp:411] conv1 <- data -I0906 13:34:09.315183 8300 net.cpp:369] conv1 -> conv1 -I0906 13:34:09.315198 8300 net.cpp:121] Setting up conv1 -I0906 13:34:09.319859 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:09.319864 8300 net.cpp:134] Memory required for data: 88998000 -I0906 13:34:09.319883 8300 layer_factory.hpp:74] Creating layer relu1 -I0906 13:34:09.319895 8300 
net.cpp:91] Creating Layer relu1 -I0906 13:34:09.319901 8300 net.cpp:411] relu1 <- conv1 -I0906 13:34:09.319913 8300 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:34:09.319926 8300 net.cpp:121] Setting up relu1 -I0906 13:34:09.319933 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:09.319937 8300 net.cpp:134] Memory required for data: 147078000 -I0906 13:34:09.319942 8300 layer_factory.hpp:74] Creating layer norm1 -I0906 13:34:09.319962 8300 net.cpp:91] Creating Layer norm1 -I0906 13:34:09.319968 8300 net.cpp:411] norm1 <- conv1 -I0906 13:34:09.319980 8300 net.cpp:369] norm1 -> norm1 -I0906 13:34:09.319991 8300 net.cpp:121] Setting up norm1 -I0906 13:34:09.320009 8300 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:09.320053 8300 net.cpp:134] Memory required for data: 205158000 -I0906 13:34:09.320060 8300 layer_factory.hpp:74] Creating layer pool1 -I0906 13:34:09.320075 8300 net.cpp:91] Creating Layer pool1 -I0906 13:34:09.320081 8300 net.cpp:411] pool1 <- norm1 -I0906 13:34:09.320093 8300 net.cpp:369] pool1 -> pool1 -I0906 13:34:09.320103 8300 net.cpp:121] Setting up pool1 -I0906 13:34:09.320122 8300 net.cpp:128] Top shape: 50 96 27 27 (3499200) -I0906 13:34:09.320125 8300 net.cpp:134] Memory required for data: 219154800 -I0906 13:34:09.320130 8300 layer_factory.hpp:74] Creating layer conv2 -I0906 13:34:09.320143 8300 net.cpp:91] Creating Layer conv2 -I0906 13:34:09.320149 8300 net.cpp:411] conv2 <- pool1 -I0906 13:34:09.320163 8300 net.cpp:369] conv2 -> conv2 -I0906 13:34:09.320174 8300 net.cpp:121] Setting up conv2 -I0906 13:34:09.359275 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:09.359290 8300 net.cpp:134] Memory required for data: 256479600 -I0906 13:34:09.359316 8300 layer_factory.hpp:74] Creating layer relu2 -I0906 13:34:09.359336 8300 net.cpp:91] Creating Layer relu2 -I0906 13:34:09.359346 8300 net.cpp:411] relu2 <- conv2 -I0906 13:34:09.359365 8300 net.cpp:358] relu2 -> conv2 (in-place) -I0906 
13:34:09.359395 8300 net.cpp:121] Setting up relu2 -I0906 13:34:09.359403 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:09.359407 8300 net.cpp:134] Memory required for data: 293804400 -I0906 13:34:09.359412 8300 layer_factory.hpp:74] Creating layer norm2 -I0906 13:34:09.359433 8300 net.cpp:91] Creating Layer norm2 -I0906 13:34:09.359438 8300 net.cpp:411] norm2 <- conv2 -I0906 13:34:09.359452 8300 net.cpp:369] norm2 -> norm2 -I0906 13:34:09.359467 8300 net.cpp:121] Setting up norm2 -I0906 13:34:09.359486 8300 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:09.359490 8300 net.cpp:134] Memory required for data: 331129200 -I0906 13:34:09.359495 8300 layer_factory.hpp:74] Creating layer pool2 -I0906 13:34:09.359508 8300 net.cpp:91] Creating Layer pool2 -I0906 13:34:09.359514 8300 net.cpp:411] pool2 <- norm2 -I0906 13:34:09.359526 8300 net.cpp:369] pool2 -> pool2 -I0906 13:34:09.359537 8300 net.cpp:121] Setting up pool2 -I0906 13:34:09.359555 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:09.359558 8300 net.cpp:134] Memory required for data: 339782000 -I0906 13:34:09.359563 8300 layer_factory.hpp:74] Creating layer conv3 -I0906 13:34:09.359581 8300 net.cpp:91] Creating Layer conv3 -I0906 13:34:09.359587 8300 net.cpp:411] conv3 <- pool2 -I0906 13:34:09.359601 8300 net.cpp:369] conv3 -> conv3 -I0906 13:34:09.359613 8300 net.cpp:121] Setting up conv3 -I0906 13:34:09.410833 8305 data_layer.cpp:120] Prefetch batch: 95 ms. -I0906 13:34:09.410863 8305 data_layer.cpp:121] Read time: 11.984 ms. -I0906 13:34:09.410871 8305 data_layer.cpp:122] Transform time: 82.885 ms. 
-I0906 13:34:09.474556 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:09.474578 8300 net.cpp:134] Memory required for data: 352761200 -I0906 13:34:09.474618 8300 layer_factory.hpp:74] Creating layer relu3 -I0906 13:34:09.474648 8300 net.cpp:91] Creating Layer relu3 -I0906 13:34:09.474663 8300 net.cpp:411] relu3 <- conv3 -I0906 13:34:09.474689 8300 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:34:09.474704 8300 net.cpp:121] Setting up relu3 -I0906 13:34:09.474714 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:09.474717 8300 net.cpp:134] Memory required for data: 365740400 -I0906 13:34:09.474721 8300 layer_factory.hpp:74] Creating layer conv4 -I0906 13:34:09.474745 8300 net.cpp:91] Creating Layer conv4 -I0906 13:34:09.474751 8300 net.cpp:411] conv4 <- conv3 -I0906 13:34:09.474766 8300 net.cpp:369] conv4 -> conv4 -I0906 13:34:09.474781 8300 net.cpp:121] Setting up conv4 -I0906 13:34:09.562909 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:09.562930 8300 net.cpp:134] Memory required for data: 378719600 -I0906 13:34:09.562957 8300 layer_factory.hpp:74] Creating layer relu4 -I0906 13:34:09.562988 8300 net.cpp:91] Creating Layer relu4 -I0906 13:34:09.563051 8300 net.cpp:411] relu4 <- conv4 -I0906 13:34:09.563086 8300 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:34:09.563102 8300 net.cpp:121] Setting up relu4 -I0906 13:34:09.563112 8300 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:09.563117 8300 net.cpp:134] Memory required for data: 391698800 -I0906 13:34:09.563122 8300 layer_factory.hpp:74] Creating layer conv5 -I0906 13:34:09.563146 8300 net.cpp:91] Creating Layer conv5 -I0906 13:34:09.563153 8300 net.cpp:411] conv5 <- conv4 -I0906 13:34:09.563168 8300 net.cpp:369] conv5 -> conv5 -I0906 13:34:09.563182 8300 net.cpp:121] Setting up conv5 -I0906 13:34:09.619202 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:09.619220 8300 net.cpp:134] Memory required for data: 400351600 -I0906 
13:34:09.619256 8300 layer_factory.hpp:74] Creating layer relu5 -I0906 13:34:09.619284 8300 net.cpp:91] Creating Layer relu5 -I0906 13:34:09.619298 8300 net.cpp:411] relu5 <- conv5 -I0906 13:34:09.619321 8300 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:34:09.619336 8300 net.cpp:121] Setting up relu5 -I0906 13:34:09.619344 8300 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:09.619349 8300 net.cpp:134] Memory required for data: 409004400 -I0906 13:34:09.619354 8300 layer_factory.hpp:74] Creating layer pool5 -I0906 13:34:09.619380 8300 net.cpp:91] Creating Layer pool5 -I0906 13:34:09.619386 8300 net.cpp:411] pool5 <- conv5 -I0906 13:34:09.619398 8300 net.cpp:369] pool5 -> pool5 -I0906 13:34:09.619411 8300 net.cpp:121] Setting up pool5 -I0906 13:34:09.619431 8300 net.cpp:128] Top shape: 50 256 6 6 (460800) -I0906 13:34:09.619434 8300 net.cpp:134] Memory required for data: 410847600 -I0906 13:34:09.619439 8300 layer_factory.hpp:74] Creating layer fc6 -I0906 13:34:09.619457 8300 net.cpp:91] Creating Layer fc6 -I0906 13:34:09.619463 8300 net.cpp:411] fc6 <- pool5 -I0906 13:34:09.619477 8300 net.cpp:369] fc6 -> fc6 -I0906 13:34:09.619488 8300 net.cpp:121] Setting up fc6 -I0906 13:34:15.320122 8300 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:15.320147 8300 net.cpp:134] Memory required for data: 411666800 -I0906 13:34:15.320174 8300 layer_factory.hpp:74] Creating layer relu6 -I0906 13:34:15.320206 8300 net.cpp:91] Creating Layer relu6 -I0906 13:34:15.320222 8300 net.cpp:411] relu6 <- fc6 -I0906 13:34:15.320248 8300 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:34:15.320263 8300 net.cpp:121] Setting up relu6 -I0906 13:34:15.320272 8300 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:15.320276 8300 net.cpp:134] Memory required for data: 412486000 -I0906 13:34:15.320281 8300 layer_factory.hpp:74] Creating layer fc7 -I0906 13:34:15.320302 8300 net.cpp:91] Creating Layer fc7 -I0906 13:34:15.320308 8300 net.cpp:411] fc7 <- fc6 -I0906 
13:34:15.320322 8300 net.cpp:369] fc7 -> fc7 -I0906 13:34:15.320338 8300 net.cpp:121] Setting up fc7 -I0906 13:34:17.700968 8300 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:17.700994 8300 net.cpp:134] Memory required for data: 413305200 -I0906 13:34:17.701020 8300 layer_factory.hpp:74] Creating layer relu7 -I0906 13:34:17.701052 8300 net.cpp:91] Creating Layer relu7 -I0906 13:34:17.701067 8300 net.cpp:411] relu7 <- fc7 -I0906 13:34:17.701093 8300 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:34:17.701109 8300 net.cpp:121] Setting up relu7 -I0906 13:34:17.701117 8300 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:17.701122 8300 net.cpp:134] Memory required for data: 414124400 -I0906 13:34:17.701125 8300 layer_factory.hpp:74] Creating layer fc8 -I0906 13:34:17.701146 8300 net.cpp:91] Creating Layer fc8 -I0906 13:34:17.701153 8300 net.cpp:411] fc8 <- fc7 -I0906 13:34:17.701166 8300 net.cpp:369] fc8 -> fc8 -I0906 13:34:17.701191 8300 net.cpp:121] Setting up fc8 -I0906 13:34:18.224659 8300 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:18.224681 8300 net.cpp:134] Memory required for data: 414324400 -I0906 13:34:18.224707 8300 layer_factory.hpp:74] Creating layer fc8_fc8_0_split -I0906 13:34:18.224737 8300 net.cpp:91] Creating Layer fc8_fc8_0_split -I0906 13:34:18.224798 8300 net.cpp:411] fc8_fc8_0_split <- fc8 -I0906 13:34:18.224828 8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 -I0906 13:34:18.224848 8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 -I0906 13:34:18.224860 8300 net.cpp:121] Setting up fc8_fc8_0_split -I0906 13:34:18.224876 8300 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:18.224882 8300 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:18.224886 8300 net.cpp:134] Memory required for data: 414724400 -I0906 13:34:18.224891 8300 layer_factory.hpp:74] Creating layer accuracy -I0906 13:34:18.224922 8300 net.cpp:91] Creating Layer accuracy -I0906 13:34:18.224927 8300 net.cpp:411] accuracy <- fc8_fc8_0_split_0 
-I0906 13:34:18.224938 8300 net.cpp:411] accuracy <- label_data_1_split_0 -I0906 13:34:18.224949 8300 net.cpp:369] accuracy -> accuracy -I0906 13:34:18.224961 8300 net.cpp:121] Setting up accuracy -I0906 13:34:18.224977 8300 net.cpp:128] Top shape: (1) -I0906 13:34:18.224980 8300 net.cpp:134] Memory required for data: 414724404 -I0906 13:34:18.224985 8300 layer_factory.hpp:74] Creating layer loss -I0906 13:34:18.224997 8300 net.cpp:91] Creating Layer loss -I0906 13:34:18.225003 8300 net.cpp:411] loss <- fc8_fc8_0_split_1 -I0906 13:34:18.225013 8300 net.cpp:411] loss <- label_data_1_split_1 -I0906 13:34:18.225023 8300 net.cpp:369] loss -> loss -I0906 13:34:18.225033 8300 net.cpp:121] Setting up loss -I0906 13:34:18.225044 8300 layer_factory.hpp:74] Creating layer loss -I0906 13:34:18.225343 8300 net.cpp:128] Top shape: (1) -I0906 13:34:18.225348 8300 net.cpp:130] with loss weight 1 -I0906 13:34:18.225364 8300 net.cpp:134] Memory required for data: 414724408 -I0906 13:34:18.225371 8300 net.cpp:193] loss needs backward computation. -I0906 13:34:18.225378 8300 net.cpp:195] accuracy does not need backward computation. -I0906 13:34:18.225386 8300 net.cpp:193] fc8_fc8_0_split needs backward computation. -I0906 13:34:18.225391 8300 net.cpp:193] fc8 needs backward computation. -I0906 13:34:18.225397 8300 net.cpp:193] relu7 needs backward computation. -I0906 13:34:18.225404 8300 net.cpp:193] fc7 needs backward computation. -I0906 13:34:18.225409 8300 net.cpp:193] relu6 needs backward computation. -I0906 13:34:18.225414 8300 net.cpp:193] fc6 needs backward computation. -I0906 13:34:18.225420 8300 net.cpp:193] pool5 needs backward computation. -I0906 13:34:18.225426 8300 net.cpp:193] relu5 needs backward computation. -I0906 13:34:18.225431 8300 net.cpp:193] conv5 needs backward computation. -I0906 13:34:18.225438 8300 net.cpp:193] relu4 needs backward computation. -I0906 13:34:18.225443 8300 net.cpp:193] conv4 needs backward computation. 
-I0906 13:34:18.225450 8300 net.cpp:193] relu3 needs backward computation. -I0906 13:34:18.225455 8300 net.cpp:193] conv3 needs backward computation. -I0906 13:34:18.225461 8300 net.cpp:193] pool2 needs backward computation. -I0906 13:34:18.225466 8300 net.cpp:193] norm2 needs backward computation. -I0906 13:34:18.225472 8300 net.cpp:193] relu2 needs backward computation. -I0906 13:34:18.225477 8300 net.cpp:193] conv2 needs backward computation. -I0906 13:34:18.225484 8300 net.cpp:193] pool1 needs backward computation. -I0906 13:34:18.225491 8300 net.cpp:193] norm1 needs backward computation. -I0906 13:34:18.225496 8300 net.cpp:193] relu1 needs backward computation. -I0906 13:34:18.225502 8300 net.cpp:193] conv1 needs backward computation. -I0906 13:34:18.225508 8300 net.cpp:195] label_data_1_split does not need backward computation. -I0906 13:34:18.225515 8300 net.cpp:195] data does not need backward computation. -I0906 13:34:18.225520 8300 net.cpp:236] This network produces output accuracy -I0906 13:34:18.225527 8300 net.cpp:236] This network produces output loss -I0906 13:34:18.225561 8300 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:34:18.225574 8300 net.cpp:248] Network initialization done. -I0906 13:34:18.225579 8300 net.cpp:249] Memory required for data: 414724408 -I0906 13:34:18.225764 8300 solver.cpp:53] Solver scaffolding done. 
-I0906 13:34:18.225879 8300 solver.cpp:270] Solving AlexNet -I0906 13:34:18.225898 8300 solver.cpp:271] Learning Rate Policy: step -I0906 13:34:18.227551 8300 solver.cpp:314] Iteration 0, Testing net (#0) -I0906 13:34:18.227571 8300 net.cpp:696] Copying source layer data -I0906 13:34:18.227577 8300 net.cpp:696] Copying source layer conv1 -I0906 13:34:18.230358 8300 net.cpp:696] Copying source layer relu1 -I0906 13:34:18.230398 8300 net.cpp:696] Copying source layer norm1 -I0906 13:34:18.230409 8300 net.cpp:696] Copying source layer pool1 -I0906 13:34:18.230419 8300 net.cpp:696] Copying source layer conv2 -I0906 13:34:18.230605 8300 net.cpp:696] Copying source layer relu2 -I0906 13:34:18.230624 8300 net.cpp:696] Copying source layer norm2 -I0906 13:34:18.230634 8300 net.cpp:696] Copying source layer pool2 -I0906 13:34:18.230644 8300 net.cpp:696] Copying source layer conv3 -I0906 13:34:18.231482 8300 net.cpp:696] Copying source layer relu3 -I0906 13:34:18.231510 8300 net.cpp:696] Copying source layer conv4 -I0906 13:34:18.232178 8300 net.cpp:696] Copying source layer relu4 -I0906 13:34:18.232195 8300 net.cpp:696] Copying source layer conv5 -I0906 13:34:18.232681 8300 net.cpp:696] Copying source layer relu5 -I0906 13:34:18.232697 8300 net.cpp:696] Copying source layer pool5 -I0906 13:34:18.232708 8300 net.cpp:696] Copying source layer fc6 -I0906 13:34:18.250728 8300 net.cpp:696] Copying source layer relu6 -I0906 13:34:18.250753 8300 net.cpp:696] Copying source layer fc7 -I0906 13:34:18.257216 8300 net.cpp:696] Copying source layer relu7 -I0906 13:34:18.257241 8300 net.cpp:696] Copying source layer fc8 -I0906 13:34:18.258977 8300 net.cpp:696] Copying source layer loss -I0906 13:34:18.259091 8300 base_data_layer.cpp:89] Thread joined -I0906 13:34:18.263509 8300 base_data_layer.cpp:93] Prefetch copied -I0906 13:34:18.263875 8300 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:34:18.362475 8306 data_layer.cpp:120] Prefetch batch: 98 ms. 
-I0906 13:34:18.362507 8306 data_layer.cpp:121] Read time: 12.694 ms. -I0906 13:34:18.362515 8306 data_layer.cpp:122] Transform time: 84.611 ms. -I0906 13:34:21.291707 8300 solver.cpp:363] Test net output #0: accuracy = 0 -I0906 13:34:21.291733 8300 solver.cpp:363] Test net output #1: loss = 6.91228 (* 1 = 6.91228 loss) -I0906 13:34:21.291775 8300 base_data_layer.cpp:89] Thread joined -I0906 13:34:21.300678 8300 base_data_layer.cpp:93] Prefetch copied -I0906 13:34:21.301050 8300 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:34:21.491194 8310 data_layer.cpp:120] Prefetch batch: 189 ms. -I0906 13:34:21.491225 8310 data_layer.cpp:121] Read time: 24.533 ms. -I0906 13:34:21.491231 8310 data_layer.cpp:122] Transform time: 163.65 ms. -I0906 13:34:28.088075 8300 solver.cpp:234] Iteration 0, loss = 0 -I0906 13:34:28.088134 8300 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) -I0906 13:34:28.088184 8300 solver.cpp:506] Iteration 0, lr = 0.01 -I0906 13:34:28.203598 8300 base_data_layer.cpp:89] Thread joined -I0906 13:34:28.212023 8300 base_data_layer.cpp:93] Prefetch copied -I0906 13:34:28.212162 8300 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:34:28.397155 8312 data_layer.cpp:120] Prefetch batch: 184 ms. -I0906 13:34:28.397193 8312 data_layer.cpp:121] Read time: 23.16 ms. -I0906 13:34:28.397200 8312 data_layer.cpp:122] Transform time: 159.902 ms. 
-I0906 13:34:30.978493 8300 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 deleted file mode 100644 index 93afd4cf..00000000 --- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 +++ /dev/null @@ -1,1208 +0,0 @@ -Log file created at: 2015/09/06 13:34:37 -Running on machine: AMD-RESEARCH -Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg -I0906 13:34:37.585557 8316 caffe.cpp:114] Use GPU with device ID 0 -I0906 13:34:37.621670 8316 device.cpp:230] Number of platforms found:1 -I0906 13:34:37.621708 8316 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing -I0906 13:34:37.621721 8316 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE -I0906 13:34:37.621724 8316 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) -I0906 13:34:37.621728 8316 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. -I0906 13:34:37.621732 8316 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices -I0906 13:34:37.621739 8316 device.cpp:286] Number of devices found:1 -I0906 13:34:37.621743 8316 device.cpp:288] DeviceID: 0x22ed250 -I0906 13:34:37.621760 8316 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU -I0906 13:34:37.621767 8316 device.cpp:393] Is it integrated GPU?: 0 -I0906 13:34:37.621772 8316 device.cpp:393] Max clock frequency MHz: 930 -I0906 13:34:37.621775 8316 device.cpp:393] Host-Device unified mem: 0 -I0906 13:34:37.621779 8316 device.cpp:393] ECC support: 0 -I0906 13:34:37.621783 8316 device.cpp:393] Endian little: 1 -I0906 13:34:37.621788 8316 device.cpp:393] Max compute units: 44 -I0906 13:34:37.621791 8316 device.cpp:393] Max work group size: 256 -I0906 13:34:37.621796 8316 device.cpp:393] Max work item dimensions: 3 -I0906 13:34:37.621801 8316 device.cpp:393] Max work item sizes: 0x100 -I0906 13:34:37.621806 8316 
device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE -I0906 13:34:37.621811 8316 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL -I0906 13:34:37.621815 8316 device.cpp:393] Max mem alloc size: 4244635648 -I0906 13:34:37.621819 8316 device.cpp:393] Global mem size: 16878927872 -I0906 13:34:37.621822 8316 device.cpp:393] Local mem size: 32768 -I0906 13:34:37.621830 8316 device.cpp:96] Picked device type : GPU 0 -I0906 13:34:40.036291 8316 device.cpp:152] Build Program -I0906 13:34:40.036520 8316 caffe.cpp:122] Starting Optimization -I0906 13:34:40.036612 8316 solver.cpp:40] Initializing solver from parameters: -test_iter: 1 -test_interval: 1000 -base_lr: 0.01 -display: 1 -max_iter: 10 -lr_policy: "step" -gamma: 0.1 -momentum: 0.9 -weight_decay: 0.0005 -stepsize: 100000 -snapshot: 10000 -snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" -solver_mode: GPU -net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" -I0906 13:34:40.036731 8316 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:34:40.037874 8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data -I0906 13:34:40.037925 8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy -I0906 13:34:40.038099 8316 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TRAIN -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - 
convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - 
value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:34:40.038537 8316 net.cpp:68] Memory required for data: 0 -I0906 13:34:40.038749 8316 layer_factory.hpp:74] Creating layer 
data -I0906 13:34:40.038802 8316 net.cpp:91] Creating Layer data -I0906 13:34:40.038825 8316 net.cpp:369] data -> data -I0906 13:34:40.038928 8316 net.cpp:369] data -> label -I0906 13:34:40.038950 8316 net.cpp:121] Setting up data -I0906 13:34:40.038962 8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:34:40.048738 8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb -I0906 13:34:40.049080 8316 data_layer.cpp:53] output data size: 100,3,227,227 -I0906 13:34:40.081225 8316 base_data_layer.cpp:43] Initializing prefetch -I0906 13:34:40.081426 8316 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:34:40.081490 8316 net.cpp:128] Top shape: 100 3 227 227 (15458700) -I0906 13:34:40.081500 8316 net.cpp:128] Top shape: 100 (100) -I0906 13:34:40.081504 8316 net.cpp:134] Memory required for data: 61835200 -I0906 13:34:40.081537 8316 layer_factory.hpp:74] Creating layer conv1 -I0906 13:34:40.081619 8316 net.cpp:91] Creating Layer conv1 -I0906 13:34:40.081641 8316 net.cpp:411] conv1 <- data -I0906 13:34:40.081694 8316 net.cpp:369] conv1 -> conv1 -I0906 13:34:40.081758 8316 net.cpp:121] Setting up conv1 -I0906 13:34:40.088135 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:40.088160 8316 net.cpp:134] Memory required for data: 177995200 -I0906 13:34:40.088239 8316 layer_factory.hpp:74] Creating layer relu1 -I0906 13:34:40.088297 8316 net.cpp:91] Creating Layer relu1 -I0906 13:34:40.088315 8316 net.cpp:411] relu1 <- conv1 -I0906 13:34:40.088351 8316 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:34:40.088372 8316 net.cpp:121] Setting up relu1 -I0906 13:34:40.088385 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:40.088390 8316 net.cpp:134] Memory required for data: 294155200 -I0906 13:34:40.088397 8316 layer_factory.hpp:74] Creating layer norm1 -I0906 13:34:40.088435 8316 net.cpp:91] Creating Layer norm1 -I0906 13:34:40.088444 8316 
net.cpp:411] norm1 <- conv1 -I0906 13:34:40.088466 8316 net.cpp:369] norm1 -> norm1 -I0906 13:34:40.088486 8316 net.cpp:121] Setting up norm1 -I0906 13:34:40.088531 8316 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:34:40.088537 8316 net.cpp:134] Memory required for data: 410315200 -I0906 13:34:40.088543 8316 layer_factory.hpp:74] Creating layer pool1 -I0906 13:34:40.088580 8316 net.cpp:91] Creating Layer pool1 -I0906 13:34:40.088590 8316 net.cpp:411] pool1 <- norm1 -I0906 13:34:40.088613 8316 net.cpp:369] pool1 -> pool1 -I0906 13:34:40.088637 8316 net.cpp:121] Setting up pool1 -I0906 13:34:40.088686 8316 net.cpp:128] Top shape: 100 96 27 27 (6998400) -I0906 13:34:40.088691 8316 net.cpp:134] Memory required for data: 438308800 -I0906 13:34:40.088701 8316 layer_factory.hpp:74] Creating layer conv2 -I0906 13:34:40.088739 8316 net.cpp:91] Creating Layer conv2 -I0906 13:34:40.088750 8316 net.cpp:411] conv2 <- pool1 -I0906 13:34:40.088783 8316 net.cpp:369] conv2 -> conv2 -I0906 13:34:40.088804 8316 net.cpp:121] Setting up conv2 -I0906 13:34:40.129534 8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:34:40.129550 8316 net.cpp:134] Memory required for data: 512958400 -I0906 13:34:40.129585 8316 layer_factory.hpp:74] Creating layer relu2 -I0906 13:34:40.129613 8316 net.cpp:91] Creating Layer relu2 -I0906 13:34:40.129624 8316 net.cpp:411] relu2 <- conv2 -I0906 13:34:40.129647 8316 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:34:40.129662 8316 net.cpp:121] Setting up relu2 -I0906 13:34:40.129670 8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:34:40.129674 8316 net.cpp:134] Memory required for data: 587608000 -I0906 13:34:40.129679 8316 layer_factory.hpp:74] Creating layer norm2 -I0906 13:34:40.129698 8316 net.cpp:91] Creating Layer norm2 -I0906 13:34:40.129703 8316 net.cpp:411] norm2 <- conv2 -I0906 13:34:40.129717 8316 net.cpp:369] norm2 -> norm2 -I0906 13:34:40.129730 8316 net.cpp:121] Setting up norm2 -I0906 13:34:40.129750 
8316 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:34:40.129755 8316 net.cpp:134] Memory required for data: 662257600 -I0906 13:34:40.129760 8316 layer_factory.hpp:74] Creating layer pool2 -I0906 13:34:40.129783 8316 net.cpp:91] Creating Layer pool2 -I0906 13:34:40.129789 8316 net.cpp:411] pool2 <- norm2 -I0906 13:34:40.129802 8316 net.cpp:369] pool2 -> pool2 -I0906 13:34:40.129813 8316 net.cpp:121] Setting up pool2 -I0906 13:34:40.129832 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:40.129837 8316 net.cpp:134] Memory required for data: 679563200 -I0906 13:34:40.129887 8316 layer_factory.hpp:74] Creating layer conv3 -I0906 13:34:40.129910 8316 net.cpp:91] Creating Layer conv3 -I0906 13:34:40.129916 8316 net.cpp:411] conv3 <- pool2 -I0906 13:34:40.129933 8316 net.cpp:369] conv3 -> conv3 -I0906 13:34:40.129948 8316 net.cpp:121] Setting up conv3 -I0906 13:34:40.246141 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:40.246165 8316 net.cpp:134] Memory required for data: 705521600 -I0906 13:34:40.246211 8316 layer_factory.hpp:74] Creating layer relu3 -I0906 13:34:40.246247 8316 net.cpp:91] Creating Layer relu3 -I0906 13:34:40.246261 8316 net.cpp:411] relu3 <- conv3 -I0906 13:34:40.246287 8316 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:34:40.246304 8316 net.cpp:121] Setting up relu3 -I0906 13:34:40.246314 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:40.246317 8316 net.cpp:134] Memory required for data: 731480000 -I0906 13:34:40.246322 8316 layer_factory.hpp:74] Creating layer conv4 -I0906 13:34:40.246351 8316 net.cpp:91] Creating Layer conv4 -I0906 13:34:40.246356 8316 net.cpp:411] conv4 <- conv3 -I0906 13:34:40.246372 8316 net.cpp:369] conv4 -> conv4 -I0906 13:34:40.246387 8316 net.cpp:121] Setting up conv4 -I0906 13:34:40.273671 8320 data_layer.cpp:120] Prefetch batch: 191 ms. -I0906 13:34:40.273718 8320 data_layer.cpp:121] Read time: 24.494 ms. 
-I0906 13:34:40.273727 8320 data_layer.cpp:122] Transform time: 165.29 ms. -I0906 13:34:40.332166 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:40.332187 8316 net.cpp:134] Memory required for data: 757438400 -I0906 13:34:40.332214 8316 layer_factory.hpp:74] Creating layer relu4 -I0906 13:34:40.332247 8316 net.cpp:91] Creating Layer relu4 -I0906 13:34:40.332262 8316 net.cpp:411] relu4 <- conv4 -I0906 13:34:40.332288 8316 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:34:40.332304 8316 net.cpp:121] Setting up relu4 -I0906 13:34:40.332314 8316 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:34:40.332317 8316 net.cpp:134] Memory required for data: 783396800 -I0906 13:34:40.332321 8316 layer_factory.hpp:74] Creating layer conv5 -I0906 13:34:40.332350 8316 net.cpp:91] Creating Layer conv5 -I0906 13:34:40.332355 8316 net.cpp:411] conv5 <- conv4 -I0906 13:34:40.332371 8316 net.cpp:369] conv5 -> conv5 -I0906 13:34:40.332386 8316 net.cpp:121] Setting up conv5 -I0906 13:34:40.388872 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:40.388891 8316 net.cpp:134] Memory required for data: 800702400 -I0906 13:34:40.388931 8316 layer_factory.hpp:74] Creating layer relu5 -I0906 13:34:40.388959 8316 net.cpp:91] Creating Layer relu5 -I0906 13:34:40.388972 8316 net.cpp:411] relu5 <- conv5 -I0906 13:34:40.388995 8316 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:34:40.389010 8316 net.cpp:121] Setting up relu5 -I0906 13:34:40.389019 8316 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:34:40.389024 8316 net.cpp:134] Memory required for data: 818008000 -I0906 13:34:40.389029 8316 layer_factory.hpp:74] Creating layer pool5 -I0906 13:34:40.389049 8316 net.cpp:91] Creating Layer pool5 -I0906 13:34:40.389053 8316 net.cpp:411] pool5 <- conv5 -I0906 13:34:40.389067 8316 net.cpp:369] pool5 -> pool5 -I0906 13:34:40.389081 8316 net.cpp:121] Setting up pool5 -I0906 13:34:40.389102 8316 net.cpp:128] Top shape: 100 256 6 6 (921600) -I0906 
13:34:40.389107 8316 net.cpp:134] Memory required for data: 821694400 -I0906 13:34:40.389112 8316 layer_factory.hpp:74] Creating layer fc6 -I0906 13:34:40.389147 8316 net.cpp:91] Creating Layer fc6 -I0906 13:34:40.389153 8316 net.cpp:411] fc6 <- pool5 -I0906 13:34:40.389169 8316 net.cpp:369] fc6 -> fc6 -I0906 13:34:40.389183 8316 net.cpp:121] Setting up fc6 -I0906 13:34:45.208031 8316 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:45.208055 8316 net.cpp:134] Memory required for data: 823332800 -I0906 13:34:45.208081 8316 layer_factory.hpp:74] Creating layer relu6 -I0906 13:34:45.208112 8316 net.cpp:91] Creating Layer relu6 -I0906 13:34:45.208128 8316 net.cpp:411] relu6 <- fc6 -I0906 13:34:45.208154 8316 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:34:45.208210 8316 net.cpp:121] Setting up relu6 -I0906 13:34:45.208220 8316 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:45.208223 8316 net.cpp:134] Memory required for data: 824971200 -I0906 13:34:45.208228 8316 layer_factory.hpp:74] Creating layer fc7 -I0906 13:34:45.208250 8316 net.cpp:91] Creating Layer fc7 -I0906 13:34:45.208256 8316 net.cpp:411] fc7 <- fc6 -I0906 13:34:45.208273 8316 net.cpp:369] fc7 -> fc7 -I0906 13:34:45.208288 8316 net.cpp:121] Setting up fc7 -I0906 13:34:47.352208 8316 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:47.352234 8316 net.cpp:134] Memory required for data: 826609600 -I0906 13:34:47.352262 8316 layer_factory.hpp:74] Creating layer relu7 -I0906 13:34:47.352295 8316 net.cpp:91] Creating Layer relu7 -I0906 13:34:47.352311 8316 net.cpp:411] relu7 <- fc7 -I0906 13:34:47.352339 8316 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:34:47.352355 8316 net.cpp:121] Setting up relu7 -I0906 13:34:47.352363 8316 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:34:47.352368 8316 net.cpp:134] Memory required for data: 828248000 -I0906 13:34:47.352373 8316 layer_factory.hpp:74] Creating layer fc8 -I0906 13:34:47.352396 8316 net.cpp:91] Creating Layer fc8 -I0906 
13:34:47.352402 8316 net.cpp:411] fc8 <- fc7 -I0906 13:34:47.352418 8316 net.cpp:369] fc8 -> fc8 -I0906 13:34:47.352433 8316 net.cpp:121] Setting up fc8 -I0906 13:34:47.878074 8316 net.cpp:128] Top shape: 100 1000 (100000) -I0906 13:34:47.878098 8316 net.cpp:134] Memory required for data: 828648000 -I0906 13:34:47.878126 8316 layer_factory.hpp:74] Creating layer loss -I0906 13:34:47.878178 8316 net.cpp:91] Creating Layer loss -I0906 13:34:47.878195 8316 net.cpp:411] loss <- fc8 -I0906 13:34:47.878217 8316 net.cpp:411] loss <- label -I0906 13:34:47.878237 8316 net.cpp:369] loss -> loss -I0906 13:34:47.878255 8316 net.cpp:121] Setting up loss -I0906 13:34:47.878273 8316 layer_factory.hpp:74] Creating layer loss -I0906 13:34:47.878825 8316 net.cpp:128] Top shape: (1) -I0906 13:34:47.878831 8316 net.cpp:130] with loss weight 1 -I0906 13:34:47.878847 8316 net.cpp:134] Memory required for data: 828648004 -I0906 13:34:47.878856 8316 net.cpp:193] loss needs backward computation. -I0906 13:34:47.878865 8316 net.cpp:193] fc8 needs backward computation. -I0906 13:34:47.878870 8316 net.cpp:193] relu7 needs backward computation. -I0906 13:34:47.878876 8316 net.cpp:193] fc7 needs backward computation. -I0906 13:34:47.878882 8316 net.cpp:193] relu6 needs backward computation. -I0906 13:34:47.878888 8316 net.cpp:193] fc6 needs backward computation. -I0906 13:34:47.878895 8316 net.cpp:193] pool5 needs backward computation. -I0906 13:34:47.878901 8316 net.cpp:193] relu5 needs backward computation. -I0906 13:34:47.878906 8316 net.cpp:193] conv5 needs backward computation. -I0906 13:34:47.878911 8316 net.cpp:193] relu4 needs backward computation. -I0906 13:34:47.878917 8316 net.cpp:193] conv4 needs backward computation. -I0906 13:34:47.878923 8316 net.cpp:193] relu3 needs backward computation. -I0906 13:34:47.878928 8316 net.cpp:193] conv3 needs backward computation. -I0906 13:34:47.878936 8316 net.cpp:193] pool2 needs backward computation. 
-I0906 13:34:47.878942 8316 net.cpp:193] norm2 needs backward computation. -I0906 13:34:47.878948 8316 net.cpp:193] relu2 needs backward computation. -I0906 13:34:47.878953 8316 net.cpp:193] conv2 needs backward computation. -I0906 13:34:47.878959 8316 net.cpp:193] pool1 needs backward computation. -I0906 13:34:47.878965 8316 net.cpp:193] norm1 needs backward computation. -I0906 13:34:47.878972 8316 net.cpp:193] relu1 needs backward computation. -I0906 13:34:47.878978 8316 net.cpp:193] conv1 needs backward computation. -I0906 13:34:47.878984 8316 net.cpp:195] data does not need backward computation. -I0906 13:34:47.878993 8316 net.cpp:236] This network produces output loss -I0906 13:34:47.879026 8316 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:34:47.879042 8316 net.cpp:248] Network initialization done. -I0906 13:34:47.879045 8316 net.cpp:249] Memory required for data: 828648004 -I0906 13:34:47.880003 8316 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:34:47.880131 8316 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data -I0906 13:34:47.880362 8316 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TEST -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - 
name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - 
lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "fc8" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:34:47.880718 8316 net.cpp:68] Memory required for data: 0 -I0906 13:34:47.880764 8316 layer_factory.hpp:74] Creating layer data -I0906 13:34:47.880786 8316 net.cpp:91] Creating 
Layer data -I0906 13:34:47.880797 8316 net.cpp:369] data -> data -I0906 13:34:47.880820 8316 net.cpp:369] data -> label -I0906 13:34:47.880832 8316 net.cpp:121] Setting up data -I0906 13:34:47.880839 8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:34:47.890487 8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb -I0906 13:34:47.890738 8316 data_layer.cpp:53] output data size: 50,3,227,227 -I0906 13:34:47.907624 8316 base_data_layer.cpp:43] Initializing prefetch -I0906 13:34:47.907733 8316 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:34:47.907762 8316 net.cpp:128] Top shape: 50 3 227 227 (7729350) -I0906 13:34:47.907769 8316 net.cpp:128] Top shape: 50 (50) -I0906 13:34:47.907773 8316 net.cpp:134] Memory required for data: 30917600 -I0906 13:34:47.907805 8316 layer_factory.hpp:74] Creating layer label_data_1_split -I0906 13:34:47.907896 8316 net.cpp:91] Creating Layer label_data_1_split -I0906 13:34:47.907917 8316 net.cpp:411] label_data_1_split <- label -I0906 13:34:47.907979 8316 net.cpp:369] label_data_1_split -> label_data_1_split_0 -I0906 13:34:47.908016 8316 net.cpp:369] label_data_1_split -> label_data_1_split_1 -I0906 13:34:47.908028 8316 net.cpp:121] Setting up label_data_1_split -I0906 13:34:47.908057 8316 net.cpp:128] Top shape: 50 (50) -I0906 13:34:47.908064 8316 net.cpp:128] Top shape: 50 (50) -I0906 13:34:47.908068 8316 net.cpp:134] Memory required for data: 30918000 -I0906 13:34:47.908073 8316 layer_factory.hpp:74] Creating layer conv1 -I0906 13:34:47.908112 8316 net.cpp:91] Creating Layer conv1 -I0906 13:34:47.908118 8316 net.cpp:411] conv1 <- data -I0906 13:34:47.908133 8316 net.cpp:369] conv1 -> conv1 -I0906 13:34:47.908148 8316 net.cpp:121] Setting up conv1 -I0906 13:34:47.912806 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:47.912811 8316 net.cpp:134] Memory required for data: 88998000 -I0906 13:34:47.912832 8316 
layer_factory.hpp:74] Creating layer relu1 -I0906 13:34:47.912844 8316 net.cpp:91] Creating Layer relu1 -I0906 13:34:47.912850 8316 net.cpp:411] relu1 <- conv1 -I0906 13:34:47.912863 8316 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:34:47.912873 8316 net.cpp:121] Setting up relu1 -I0906 13:34:47.912880 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:47.912883 8316 net.cpp:134] Memory required for data: 147078000 -I0906 13:34:47.912889 8316 layer_factory.hpp:74] Creating layer norm1 -I0906 13:34:47.912907 8316 net.cpp:91] Creating Layer norm1 -I0906 13:34:47.912912 8316 net.cpp:411] norm1 <- conv1 -I0906 13:34:47.912925 8316 net.cpp:369] norm1 -> norm1 -I0906 13:34:47.912936 8316 net.cpp:121] Setting up norm1 -I0906 13:34:47.912955 8316 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:34:47.912999 8316 net.cpp:134] Memory required for data: 205158000 -I0906 13:34:47.913004 8316 layer_factory.hpp:74] Creating layer pool1 -I0906 13:34:47.913022 8316 net.cpp:91] Creating Layer pool1 -I0906 13:34:47.913027 8316 net.cpp:411] pool1 <- norm1 -I0906 13:34:47.913040 8316 net.cpp:369] pool1 -> pool1 -I0906 13:34:47.913050 8316 net.cpp:121] Setting up pool1 -I0906 13:34:47.913069 8316 net.cpp:128] Top shape: 50 96 27 27 (3499200) -I0906 13:34:47.913074 8316 net.cpp:134] Memory required for data: 219154800 -I0906 13:34:47.913079 8316 layer_factory.hpp:74] Creating layer conv2 -I0906 13:34:47.913091 8316 net.cpp:91] Creating Layer conv2 -I0906 13:34:47.913096 8316 net.cpp:411] conv2 <- pool1 -I0906 13:34:47.913111 8316 net.cpp:369] conv2 -> conv2 -I0906 13:34:47.913123 8316 net.cpp:121] Setting up conv2 -I0906 13:34:47.952414 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:47.952428 8316 net.cpp:134] Memory required for data: 256479600 -I0906 13:34:47.952455 8316 layer_factory.hpp:74] Creating layer relu2 -I0906 13:34:47.952477 8316 net.cpp:91] Creating Layer relu2 -I0906 13:34:47.952487 8316 net.cpp:411] relu2 <- conv2 -I0906 
13:34:47.952507 8316 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:34:47.952518 8316 net.cpp:121] Setting up relu2 -I0906 13:34:47.952527 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:47.952532 8316 net.cpp:134] Memory required for data: 293804400 -I0906 13:34:47.952536 8316 layer_factory.hpp:74] Creating layer norm2 -I0906 13:34:47.952558 8316 net.cpp:91] Creating Layer norm2 -I0906 13:34:47.952564 8316 net.cpp:411] norm2 <- conv2 -I0906 13:34:47.952577 8316 net.cpp:369] norm2 -> norm2 -I0906 13:34:47.952591 8316 net.cpp:121] Setting up norm2 -I0906 13:34:47.952610 8316 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:34:47.952615 8316 net.cpp:134] Memory required for data: 331129200 -I0906 13:34:47.952620 8316 layer_factory.hpp:74] Creating layer pool2 -I0906 13:34:47.952635 8316 net.cpp:91] Creating Layer pool2 -I0906 13:34:47.952641 8316 net.cpp:411] pool2 <- norm2 -I0906 13:34:47.952653 8316 net.cpp:369] pool2 -> pool2 -I0906 13:34:47.952663 8316 net.cpp:121] Setting up pool2 -I0906 13:34:47.952682 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:47.952685 8316 net.cpp:134] Memory required for data: 339782000 -I0906 13:34:47.952690 8316 layer_factory.hpp:74] Creating layer conv3 -I0906 13:34:47.952713 8316 net.cpp:91] Creating Layer conv3 -I0906 13:34:47.952718 8316 net.cpp:411] conv3 <- pool2 -I0906 13:34:47.952733 8316 net.cpp:369] conv3 -> conv3 -I0906 13:34:47.952744 8316 net.cpp:121] Setting up conv3 -I0906 13:34:48.002686 8321 data_layer.cpp:120] Prefetch batch: 94 ms. -I0906 13:34:48.002718 8321 data_layer.cpp:121] Read time: 12.003 ms. -I0906 13:34:48.002725 8321 data_layer.cpp:122] Transform time: 81.802 ms. 
-I0906 13:34:48.066742 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:48.066764 8316 net.cpp:134] Memory required for data: 352761200 -I0906 13:34:48.066805 8316 layer_factory.hpp:74] Creating layer relu3 -I0906 13:34:48.066839 8316 net.cpp:91] Creating Layer relu3 -I0906 13:34:48.066854 8316 net.cpp:411] relu3 <- conv3 -I0906 13:34:48.066880 8316 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:34:48.066897 8316 net.cpp:121] Setting up relu3 -I0906 13:34:48.066906 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:48.066910 8316 net.cpp:134] Memory required for data: 365740400 -I0906 13:34:48.066915 8316 layer_factory.hpp:74] Creating layer conv4 -I0906 13:34:48.066942 8316 net.cpp:91] Creating Layer conv4 -I0906 13:34:48.066947 8316 net.cpp:411] conv4 <- conv3 -I0906 13:34:48.066964 8316 net.cpp:369] conv4 -> conv4 -I0906 13:34:48.066979 8316 net.cpp:121] Setting up conv4 -I0906 13:34:48.151291 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:48.151312 8316 net.cpp:134] Memory required for data: 378719600 -I0906 13:34:48.151340 8316 layer_factory.hpp:74] Creating layer relu4 -I0906 13:34:48.151372 8316 net.cpp:91] Creating Layer relu4 -I0906 13:34:48.151430 8316 net.cpp:411] relu4 <- conv4 -I0906 13:34:48.151458 8316 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:34:48.151473 8316 net.cpp:121] Setting up relu4 -I0906 13:34:48.151482 8316 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:34:48.151486 8316 net.cpp:134] Memory required for data: 391698800 -I0906 13:34:48.151491 8316 layer_factory.hpp:74] Creating layer conv5 -I0906 13:34:48.151517 8316 net.cpp:91] Creating Layer conv5 -I0906 13:34:48.151523 8316 net.cpp:411] conv5 <- conv4 -I0906 13:34:48.151540 8316 net.cpp:369] conv5 -> conv5 -I0906 13:34:48.151554 8316 net.cpp:121] Setting up conv5 -I0906 13:34:48.208228 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:48.208250 8316 net.cpp:134] Memory required for data: 400351600 -I0906 
13:34:48.208292 8316 layer_factory.hpp:74] Creating layer relu5 -I0906 13:34:48.208322 8316 net.cpp:91] Creating Layer relu5 -I0906 13:34:48.208336 8316 net.cpp:411] relu5 <- conv5 -I0906 13:34:48.208360 8316 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:34:48.208376 8316 net.cpp:121] Setting up relu5 -I0906 13:34:48.208385 8316 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:34:48.208389 8316 net.cpp:134] Memory required for data: 409004400 -I0906 13:34:48.208395 8316 layer_factory.hpp:74] Creating layer pool5 -I0906 13:34:48.208425 8316 net.cpp:91] Creating Layer pool5 -I0906 13:34:48.208431 8316 net.cpp:411] pool5 <- conv5 -I0906 13:34:48.208446 8316 net.cpp:369] pool5 -> pool5 -I0906 13:34:48.208459 8316 net.cpp:121] Setting up pool5 -I0906 13:34:48.208479 8316 net.cpp:128] Top shape: 50 256 6 6 (460800) -I0906 13:34:48.208483 8316 net.cpp:134] Memory required for data: 410847600 -I0906 13:34:48.208488 8316 layer_factory.hpp:74] Creating layer fc6 -I0906 13:34:48.208510 8316 net.cpp:91] Creating Layer fc6 -I0906 13:34:48.208516 8316 net.cpp:411] fc6 <- pool5 -I0906 13:34:48.208530 8316 net.cpp:369] fc6 -> fc6 -I0906 13:34:48.208544 8316 net.cpp:121] Setting up fc6 -I0906 13:34:52.951850 8316 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:52.951876 8316 net.cpp:134] Memory required for data: 411666800 -I0906 13:34:52.951903 8316 layer_factory.hpp:74] Creating layer relu6 -I0906 13:34:52.951944 8316 net.cpp:91] Creating Layer relu6 -I0906 13:34:52.951961 8316 net.cpp:411] relu6 <- fc6 -I0906 13:34:52.951987 8316 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:34:52.952003 8316 net.cpp:121] Setting up relu6 -I0906 13:34:52.952010 8316 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:52.952014 8316 net.cpp:134] Memory required for data: 412486000 -I0906 13:34:52.952019 8316 layer_factory.hpp:74] Creating layer fc7 -I0906 13:34:52.952044 8316 net.cpp:91] Creating Layer fc7 -I0906 13:34:52.952049 8316 net.cpp:411] fc7 <- fc6 -I0906 
13:34:52.952065 8316 net.cpp:369] fc7 -> fc7 -I0906 13:34:52.952080 8316 net.cpp:121] Setting up fc7 -I0906 13:34:55.059911 8316 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:55.059948 8316 net.cpp:134] Memory required for data: 413305200 -I0906 13:34:55.059976 8316 layer_factory.hpp:74] Creating layer relu7 -I0906 13:34:55.060010 8316 net.cpp:91] Creating Layer relu7 -I0906 13:34:55.060025 8316 net.cpp:411] relu7 <- fc7 -I0906 13:34:55.060053 8316 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:34:55.060070 8316 net.cpp:121] Setting up relu7 -I0906 13:34:55.060078 8316 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:34:55.060082 8316 net.cpp:134] Memory required for data: 414124400 -I0906 13:34:55.060087 8316 layer_factory.hpp:74] Creating layer fc8 -I0906 13:34:55.060109 8316 net.cpp:91] Creating Layer fc8 -I0906 13:34:55.060116 8316 net.cpp:411] fc8 <- fc7 -I0906 13:34:55.060132 8316 net.cpp:369] fc8 -> fc8 -I0906 13:34:55.060156 8316 net.cpp:121] Setting up fc8 -I0906 13:34:55.576926 8316 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:55.576946 8316 net.cpp:134] Memory required for data: 414324400 -I0906 13:34:55.576972 8316 layer_factory.hpp:74] Creating layer fc8_fc8_0_split -I0906 13:34:55.577006 8316 net.cpp:91] Creating Layer fc8_fc8_0_split -I0906 13:34:55.577097 8316 net.cpp:411] fc8_fc8_0_split <- fc8 -I0906 13:34:55.577136 8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 -I0906 13:34:55.577162 8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 -I0906 13:34:55.577173 8316 net.cpp:121] Setting up fc8_fc8_0_split -I0906 13:34:55.577191 8316 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:55.577198 8316 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:34:55.577201 8316 net.cpp:134] Memory required for data: 414724400 -I0906 13:34:55.577206 8316 layer_factory.hpp:74] Creating layer accuracy -I0906 13:34:55.577237 8316 net.cpp:91] Creating Layer accuracy -I0906 13:34:55.577244 8316 net.cpp:411] accuracy <- fc8_fc8_0_split_0 
-I0906 13:34:55.577255 8316 net.cpp:411] accuracy <- label_data_1_split_0 -I0906 13:34:55.577266 8316 net.cpp:369] accuracy -> accuracy -I0906 13:34:55.577277 8316 net.cpp:121] Setting up accuracy -I0906 13:34:55.577293 8316 net.cpp:128] Top shape: (1) -I0906 13:34:55.577297 8316 net.cpp:134] Memory required for data: 414724404 -I0906 13:34:55.577302 8316 layer_factory.hpp:74] Creating layer loss -I0906 13:34:55.577314 8316 net.cpp:91] Creating Layer loss -I0906 13:34:55.577321 8316 net.cpp:411] loss <- fc8_fc8_0_split_1 -I0906 13:34:55.577332 8316 net.cpp:411] loss <- label_data_1_split_1 -I0906 13:34:55.577342 8316 net.cpp:369] loss -> loss -I0906 13:34:55.577353 8316 net.cpp:121] Setting up loss -I0906 13:34:55.577363 8316 layer_factory.hpp:74] Creating layer loss -I0906 13:34:55.577759 8316 net.cpp:128] Top shape: (1) -I0906 13:34:55.577764 8316 net.cpp:130] with loss weight 1 -I0906 13:34:55.577780 8316 net.cpp:134] Memory required for data: 414724408 -I0906 13:34:55.577786 8316 net.cpp:193] loss needs backward computation. -I0906 13:34:55.577795 8316 net.cpp:195] accuracy does not need backward computation. -I0906 13:34:55.577801 8316 net.cpp:193] fc8_fc8_0_split needs backward computation. -I0906 13:34:55.577807 8316 net.cpp:193] fc8 needs backward computation. -I0906 13:34:55.577813 8316 net.cpp:193] relu7 needs backward computation. -I0906 13:34:55.577818 8316 net.cpp:193] fc7 needs backward computation. -I0906 13:34:55.577824 8316 net.cpp:193] relu6 needs backward computation. -I0906 13:34:55.577831 8316 net.cpp:193] fc6 needs backward computation. -I0906 13:34:55.577836 8316 net.cpp:193] pool5 needs backward computation. -I0906 13:34:55.577842 8316 net.cpp:193] relu5 needs backward computation. -I0906 13:34:55.577847 8316 net.cpp:193] conv5 needs backward computation. -I0906 13:34:55.577853 8316 net.cpp:193] relu4 needs backward computation. -I0906 13:34:55.577859 8316 net.cpp:193] conv4 needs backward computation. 
-I0906 13:34:55.577864 8316 net.cpp:193] relu3 needs backward computation. -I0906 13:34:55.577870 8316 net.cpp:193] conv3 needs backward computation. -I0906 13:34:55.577877 8316 net.cpp:193] pool2 needs backward computation. -I0906 13:34:55.577883 8316 net.cpp:193] norm2 needs backward computation. -I0906 13:34:55.577888 8316 net.cpp:193] relu2 needs backward computation. -I0906 13:34:55.577893 8316 net.cpp:193] conv2 needs backward computation. -I0906 13:34:55.577899 8316 net.cpp:193] pool1 needs backward computation. -I0906 13:34:55.577905 8316 net.cpp:193] norm1 needs backward computation. -I0906 13:34:55.577911 8316 net.cpp:193] relu1 needs backward computation. -I0906 13:34:55.577916 8316 net.cpp:193] conv1 needs backward computation. -I0906 13:34:55.577924 8316 net.cpp:195] label_data_1_split does not need backward computation. -I0906 13:34:55.577931 8316 net.cpp:195] data does not need backward computation. -I0906 13:34:55.577936 8316 net.cpp:236] This network produces output accuracy -I0906 13:34:55.577942 8316 net.cpp:236] This network produces output loss -I0906 13:34:55.577977 8316 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:34:55.577991 8316 net.cpp:248] Network initialization done. -I0906 13:34:55.577996 8316 net.cpp:249] Memory required for data: 414724408 -I0906 13:34:55.578182 8316 solver.cpp:53] Solver scaffolding done. 
-I0906 13:34:55.578306 8316 solver.cpp:270] Solving AlexNet -I0906 13:34:55.578330 8316 solver.cpp:271] Learning Rate Policy: step -I0906 13:34:55.580096 8316 solver.cpp:314] Iteration 0, Testing net (#0) -I0906 13:34:55.580111 8316 net.cpp:696] Copying source layer data -I0906 13:34:55.580116 8316 net.cpp:696] Copying source layer conv1 -I0906 13:34:55.583168 8316 net.cpp:696] Copying source layer relu1 -I0906 13:34:55.583199 8316 net.cpp:696] Copying source layer norm1 -I0906 13:34:55.583204 8316 net.cpp:696] Copying source layer pool1 -I0906 13:34:55.583209 8316 net.cpp:696] Copying source layer conv2 -I0906 13:34:55.583320 8316 net.cpp:696] Copying source layer relu2 -I0906 13:34:55.583326 8316 net.cpp:696] Copying source layer norm2 -I0906 13:34:55.583331 8316 net.cpp:696] Copying source layer pool2 -I0906 13:34:55.583335 8316 net.cpp:696] Copying source layer conv3 -I0906 13:34:55.583690 8316 net.cpp:696] Copying source layer relu3 -I0906 13:34:55.583698 8316 net.cpp:696] Copying source layer conv4 -I0906 13:34:55.583895 8316 net.cpp:696] Copying source layer relu4 -I0906 13:34:55.583902 8316 net.cpp:696] Copying source layer conv5 -I0906 13:34:55.584177 8316 net.cpp:696] Copying source layer relu5 -I0906 13:34:55.584185 8316 net.cpp:696] Copying source layer pool5 -I0906 13:34:55.584189 8316 net.cpp:696] Copying source layer fc6 -I0906 13:34:55.589432 8316 net.cpp:696] Copying source layer relu6 -I0906 13:34:55.589460 8316 net.cpp:696] Copying source layer fc7 -I0906 13:34:55.592273 8316 net.cpp:696] Copying source layer relu7 -I0906 13:34:55.592288 8316 net.cpp:696] Copying source layer fc8 -I0906 13:34:55.593138 8316 net.cpp:696] Copying source layer loss -I0906 13:34:55.593260 8316 base_data_layer.cpp:89] Thread joined -I0906 13:34:55.597589 8316 base_data_layer.cpp:93] Prefetch copied -I0906 13:34:55.597887 8316 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:34:55.695569 8322 data_layer.cpp:120] Prefetch batch: 97 ms. 
-I0906 13:34:55.695600 8322 data_layer.cpp:121] Read time: 13.209 ms. -I0906 13:34:55.695606 8322 data_layer.cpp:122] Transform time: 83.025 ms. -I0906 13:34:58.623245 8316 solver.cpp:363] Test net output #0: accuracy = 0 -I0906 13:34:58.623273 8316 solver.cpp:363] Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss) -I0906 13:34:58.623322 8316 base_data_layer.cpp:89] Thread joined -I0906 13:34:58.632244 8316 base_data_layer.cpp:93] Prefetch copied -I0906 13:34:58.632606 8316 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:34:58.819707 8323 data_layer.cpp:120] Prefetch batch: 186 ms. -I0906 13:34:58.819741 8323 data_layer.cpp:121] Read time: 24.148 ms. -I0906 13:34:58.819747 8323 data_layer.cpp:122] Transform time: 161.152 ms. -I0906 13:35:05.407784 8316 solver.cpp:234] Iteration 0, loss = 0 -I0906 13:35:05.407842 8316 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) -I0906 13:35:05.407891 8316 solver.cpp:506] Iteration 0, lr = 0.01 -I0906 13:35:05.525874 8316 base_data_layer.cpp:89] Thread joined -I0906 13:35:05.533869 8316 base_data_layer.cpp:93] Prefetch copied -I0906 13:35:05.534140 8316 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:35:05.722632 8328 data_layer.cpp:120] Prefetch batch: 188 ms. -I0906 13:35:05.722664 8328 data_layer.cpp:121] Read time: 24.184 ms. -I0906 13:35:05.722672 8328 data_layer.cpp:122] Transform time: 162.257 ms. 
-I0906 13:35:08.300590 8316 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 deleted file mode 100644 index 6ec81c82..00000000 --- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 +++ /dev/null @@ -1,1160 +0,0 @@ -Log file created at: 2015/09/06 13:58:05 -Running on machine: AMD-RESEARCH -Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg -I0906 13:58:05.835170 16515 caffe.cpp:114] Use GPU with device ID 0 -I0906 13:58:05.875704 16515 device.cpp:230] Number of platforms found:1 -I0906 13:58:05.875743 16515 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing -I0906 13:58:05.875757 16515 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE -I0906 13:58:05.875763 16515 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) -I0906 13:58:05.875769 16515 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. 
-I0906 13:58:05.875774 16515 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices -I0906 13:58:05.875783 16515 device.cpp:286] Number of devices found:1 -I0906 13:58:05.875788 16515 device.cpp:288] DeviceID: 0x18ab2f0 -I0906 13:58:05.875809 16515 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU -I0906 13:58:05.875818 16515 device.cpp:393] Is it integrated GPU?: 0 -I0906 13:58:05.875823 16515 device.cpp:393] Max clock frequency MHz: 930 -I0906 13:58:05.875829 16515 device.cpp:393] Host-Device unified mem: 0 -I0906 13:58:05.875834 16515 device.cpp:393] ECC support: 0 -I0906 13:58:05.875839 16515 device.cpp:393] Endian little: 1 -I0906 13:58:05.875844 16515 device.cpp:393] Max compute units: 44 -I0906 13:58:05.875849 16515 device.cpp:393] Max work group size: 256 -I0906 13:58:05.875856 16515 device.cpp:393] Max work item dimensions: 3 -I0906 13:58:05.875862 16515 device.cpp:393] Max work item sizes: 0x100 -I0906 13:58:05.875869 16515 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE -I0906 13:58:05.875875 16515 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL -I0906 13:58:05.875881 16515 device.cpp:393] Max mem alloc size: 4244635648 -I0906 13:58:05.875886 16515 device.cpp:393] Global mem size: 16878927872 -I0906 13:58:05.875891 16515 device.cpp:393] Local mem size: 32768 -I0906 13:58:05.875902 16515 device.cpp:96] Picked device type : GPU 0 -I0906 13:58:08.267483 16515 device.cpp:152] Build Program -I0906 13:58:08.267706 16515 caffe.cpp:122] Starting Optimization -I0906 13:58:08.267797 16515 solver.cpp:40] Initializing solver from parameters: -test_iter: 1 -test_interval: 1000 -base_lr: 0.01 -display: 1 -max_iter: 10 -lr_policy: "step" -gamma: 0.1 -momentum: 0.9 -weight_decay: 0.0005 -stepsize: 100000 -snapshot: 10000 -snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" -solver_mode: GPU -net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" -I0906 
13:58:08.267910 16515 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:58:08.269042 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data -I0906 13:58:08.269093 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy -I0906 13:58:08.269273 16515 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TRAIN -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: 
"norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - 
} -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:58:08.269708 16515 net.cpp:68] Memory required for data: 0 -I0906 13:58:08.269917 16515 layer_factory.hpp:74] Creating layer data -I0906 13:58:08.269971 16515 net.cpp:91] Creating Layer data -I0906 13:58:08.269992 16515 net.cpp:369] data -> data -I0906 13:58:08.270097 16515 net.cpp:369] data -> label -I0906 13:58:08.270122 16515 net.cpp:121] Setting up data -I0906 13:58:08.270134 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:58:08.279337 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb -I0906 13:58:08.279680 16515 data_layer.cpp:53] output data size: 100,3,227,227 -I0906 13:58:08.311036 16515 base_data_layer.cpp:43] Initializing prefetch -I0906 13:58:08.311240 16515 base_data_layer.cpp:45] Prefetch initialized. 
-I0906 13:58:08.311303 16515 net.cpp:128] Top shape: 100 3 227 227 (15458700) -I0906 13:58:08.311313 16515 net.cpp:128] Top shape: 100 (100) -I0906 13:58:08.311318 16515 net.cpp:134] Memory required for data: 61835200 -I0906 13:58:08.311352 16515 layer_factory.hpp:74] Creating layer conv1 -I0906 13:58:08.311431 16515 net.cpp:91] Creating Layer conv1 -I0906 13:58:08.311453 16515 net.cpp:411] conv1 <- data -I0906 13:58:08.311504 16515 net.cpp:369] conv1 -> conv1 -I0906 13:58:08.311569 16515 net.cpp:121] Setting up conv1 -I0906 13:58:08.316509 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:08.316515 16515 net.cpp:134] Memory required for data: 177995200 -I0906 13:58:08.316555 16515 layer_factory.hpp:74] Creating layer relu1 -I0906 13:58:08.316577 16515 net.cpp:91] Creating Layer relu1 -I0906 13:58:08.316583 16515 net.cpp:411] relu1 <- conv1 -I0906 13:58:08.316597 16515 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:58:08.316606 16515 net.cpp:121] Setting up relu1 -I0906 13:58:08.316615 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:08.316619 16515 net.cpp:134] Memory required for data: 294155200 -I0906 13:58:08.316623 16515 layer_factory.hpp:74] Creating layer norm1 -I0906 13:58:08.316653 16515 net.cpp:91] Creating Layer norm1 -I0906 13:58:08.316659 16515 net.cpp:411] norm1 <- conv1 -I0906 13:58:08.316673 16515 net.cpp:369] norm1 -> norm1 -I0906 13:58:08.316686 16515 net.cpp:121] Setting up norm1 -I0906 13:58:08.316710 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:08.316715 16515 net.cpp:134] Memory required for data: 410315200 -I0906 13:58:08.316720 16515 layer_factory.hpp:74] Creating layer pool1 -I0906 13:58:08.316745 16515 net.cpp:91] Creating Layer pool1 -I0906 13:58:08.316750 16515 net.cpp:411] pool1 <- norm1 -I0906 13:58:08.316763 16515 net.cpp:369] pool1 -> pool1 -I0906 13:58:08.316776 16515 net.cpp:121] Setting up pool1 -I0906 13:58:08.316805 16515 net.cpp:128] Top shape: 100 96 27 27 (6998400) 
-I0906 13:58:08.316809 16515 net.cpp:134] Memory required for data: 438308800 -I0906 13:58:08.316814 16515 layer_factory.hpp:74] Creating layer conv2 -I0906 13:58:08.316829 16515 net.cpp:91] Creating Layer conv2 -I0906 13:58:08.316834 16515 net.cpp:411] conv2 <- pool1 -I0906 13:58:08.316850 16515 net.cpp:369] conv2 -> conv2 -I0906 13:58:08.316862 16515 net.cpp:121] Setting up conv2 -I0906 13:58:08.356899 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:08.356914 16515 net.cpp:134] Memory required for data: 512958400 -I0906 13:58:08.356945 16515 layer_factory.hpp:74] Creating layer relu2 -I0906 13:58:08.356967 16515 net.cpp:91] Creating Layer relu2 -I0906 13:58:08.356978 16515 net.cpp:411] relu2 <- conv2 -I0906 13:58:08.356998 16515 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:58:08.357012 16515 net.cpp:121] Setting up relu2 -I0906 13:58:08.357022 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:08.357025 16515 net.cpp:134] Memory required for data: 587608000 -I0906 13:58:08.357030 16515 layer_factory.hpp:74] Creating layer norm2 -I0906 13:58:08.357046 16515 net.cpp:91] Creating Layer norm2 -I0906 13:58:08.357053 16515 net.cpp:411] norm2 <- conv2 -I0906 13:58:08.357066 16515 net.cpp:369] norm2 -> norm2 -I0906 13:58:08.357079 16515 net.cpp:121] Setting up norm2 -I0906 13:58:08.357108 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:08.357113 16515 net.cpp:134] Memory required for data: 662257600 -I0906 13:58:08.357118 16515 layer_factory.hpp:74] Creating layer pool2 -I0906 13:58:08.357146 16515 net.cpp:91] Creating Layer pool2 -I0906 13:58:08.357152 16515 net.cpp:411] pool2 <- norm2 -I0906 13:58:08.357166 16515 net.cpp:369] pool2 -> pool2 -I0906 13:58:08.357177 16515 net.cpp:121] Setting up pool2 -I0906 13:58:08.357200 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:08.357204 16515 net.cpp:134] Memory required for data: 679563200 -I0906 13:58:08.357259 16515 layer_factory.hpp:74] Creating 
layer conv3 -I0906 13:58:08.357281 16515 net.cpp:91] Creating Layer conv3 -I0906 13:58:08.357287 16515 net.cpp:411] conv3 <- pool2 -I0906 13:58:08.357303 16515 net.cpp:369] conv3 -> conv3 -I0906 13:58:08.357318 16515 net.cpp:121] Setting up conv3 -I0906 13:58:08.475977 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:08.475999 16515 net.cpp:134] Memory required for data: 705521600 -I0906 13:58:08.476043 16515 layer_factory.hpp:74] Creating layer relu3 -I0906 13:58:08.476078 16515 net.cpp:91] Creating Layer relu3 -I0906 13:58:08.476093 16515 net.cpp:411] relu3 <- conv3 -I0906 13:58:08.476120 16515 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:58:08.476137 16515 net.cpp:121] Setting up relu3 -I0906 13:58:08.476147 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:08.476151 16515 net.cpp:134] Memory required for data: 731480000 -I0906 13:58:08.476156 16515 layer_factory.hpp:74] Creating layer conv4 -I0906 13:58:08.476184 16515 net.cpp:91] Creating Layer conv4 -I0906 13:58:08.476191 16515 net.cpp:411] conv4 <- conv3 -I0906 13:58:08.476207 16515 net.cpp:369] conv4 -> conv4 -I0906 13:58:08.476222 16515 net.cpp:121] Setting up conv4 -I0906 13:58:08.500998 16519 data_layer.cpp:120] Prefetch batch: 189 ms. -I0906 13:58:08.501045 16519 data_layer.cpp:121] Read time: 23.893 ms. -I0906 13:58:08.501054 16519 data_layer.cpp:122] Transform time: 163.51 ms. 
-I0906 13:58:08.563753 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:08.563774 16515 net.cpp:134] Memory required for data: 757438400 -I0906 13:58:08.563802 16515 layer_factory.hpp:74] Creating layer relu4 -I0906 13:58:08.563835 16515 net.cpp:91] Creating Layer relu4 -I0906 13:58:08.563849 16515 net.cpp:411] relu4 <- conv4 -I0906 13:58:08.563876 16515 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:58:08.563892 16515 net.cpp:121] Setting up relu4 -I0906 13:58:08.563902 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:08.563906 16515 net.cpp:134] Memory required for data: 783396800 -I0906 13:58:08.563911 16515 layer_factory.hpp:74] Creating layer conv5 -I0906 13:58:08.563946 16515 net.cpp:91] Creating Layer conv5 -I0906 13:58:08.563951 16515 net.cpp:411] conv5 <- conv4 -I0906 13:58:08.563968 16515 net.cpp:369] conv5 -> conv5 -I0906 13:58:08.563982 16515 net.cpp:121] Setting up conv5 -I0906 13:58:08.621495 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:08.621512 16515 net.cpp:134] Memory required for data: 800702400 -I0906 13:58:08.621553 16515 layer_factory.hpp:74] Creating layer relu5 -I0906 13:58:08.621584 16515 net.cpp:91] Creating Layer relu5 -I0906 13:58:08.621598 16515 net.cpp:411] relu5 <- conv5 -I0906 13:58:08.621623 16515 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:58:08.621639 16515 net.cpp:121] Setting up relu5 -I0906 13:58:08.621649 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:08.621652 16515 net.cpp:134] Memory required for data: 818008000 -I0906 13:58:08.621657 16515 layer_factory.hpp:74] Creating layer pool5 -I0906 13:58:08.621677 16515 net.cpp:91] Creating Layer pool5 -I0906 13:58:08.621683 16515 net.cpp:411] pool5 <- conv5 -I0906 13:58:08.621697 16515 net.cpp:369] pool5 -> pool5 -I0906 13:58:08.621711 16515 net.cpp:121] Setting up pool5 -I0906 13:58:08.621732 16515 net.cpp:128] Top shape: 100 256 6 6 (921600) -I0906 13:58:08.621737 16515 net.cpp:134] Memory 
required for data: 821694400 -I0906 13:58:08.621742 16515 layer_factory.hpp:74] Creating layer fc6 -I0906 13:58:08.621778 16515 net.cpp:91] Creating Layer fc6 -I0906 13:58:08.621783 16515 net.cpp:411] fc6 <- pool5 -I0906 13:58:08.621798 16515 net.cpp:369] fc6 -> fc6 -I0906 13:58:08.621812 16515 net.cpp:121] Setting up fc6 -I0906 13:58:13.492439 16515 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:58:13.492465 16515 net.cpp:134] Memory required for data: 823332800 -I0906 13:58:13.492493 16515 layer_factory.hpp:74] Creating layer relu6 -I0906 13:58:13.492527 16515 net.cpp:91] Creating Layer relu6 -I0906 13:58:13.492542 16515 net.cpp:411] relu6 <- fc6 -I0906 13:58:13.492568 16515 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:58:13.492630 16515 net.cpp:121] Setting up relu6 -I0906 13:58:13.492640 16515 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:58:13.492643 16515 net.cpp:134] Memory required for data: 824971200 -I0906 13:58:13.492648 16515 layer_factory.hpp:74] Creating layer fc7 -I0906 13:58:13.492671 16515 net.cpp:91] Creating Layer fc7 -I0906 13:58:13.492677 16515 net.cpp:411] fc7 <- fc6 -I0906 13:58:13.492693 16515 net.cpp:369] fc7 -> fc7 -I0906 13:58:13.492708 16515 net.cpp:121] Setting up fc7 -I0906 13:58:15.661120 16515 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:58:15.661144 16515 net.cpp:134] Memory required for data: 826609600 -I0906 13:58:15.661171 16515 layer_factory.hpp:74] Creating layer relu7 -I0906 13:58:15.661205 16515 net.cpp:91] Creating Layer relu7 -I0906 13:58:15.661221 16515 net.cpp:411] relu7 <- fc7 -I0906 13:58:15.661247 16515 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:58:15.661263 16515 net.cpp:121] Setting up relu7 -I0906 13:58:15.661273 16515 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:58:15.661276 16515 net.cpp:134] Memory required for data: 828248000 -I0906 13:58:15.661281 16515 layer_factory.hpp:74] Creating layer fc8 -I0906 13:58:15.661304 16515 net.cpp:91] Creating Layer fc8 -I0906 13:58:15.661310 16515 
net.cpp:411] fc8 <- fc7 -I0906 13:58:15.661325 16515 net.cpp:369] fc8 -> fc8 -I0906 13:58:15.661340 16515 net.cpp:121] Setting up fc8 -I0906 13:58:16.190832 16515 net.cpp:128] Top shape: 100 1000 (100000) -I0906 13:58:16.190855 16515 net.cpp:134] Memory required for data: 828648000 -I0906 13:58:16.190881 16515 layer_factory.hpp:74] Creating layer loss -I0906 13:58:16.190932 16515 net.cpp:91] Creating Layer loss -I0906 13:58:16.190946 16515 net.cpp:411] loss <- fc8 -I0906 13:58:16.190969 16515 net.cpp:411] loss <- label -I0906 13:58:16.190989 16515 net.cpp:369] loss -> loss -I0906 13:58:16.191009 16515 net.cpp:121] Setting up loss -I0906 13:58:16.191030 16515 layer_factory.hpp:74] Creating layer loss -I0906 13:58:16.191588 16515 net.cpp:128] Top shape: (1) -I0906 13:58:16.191593 16515 net.cpp:130] with loss weight 1 -I0906 13:58:16.191611 16515 net.cpp:134] Memory required for data: 828648004 -I0906 13:58:16.191619 16515 net.cpp:193] loss needs backward computation. -I0906 13:58:16.191627 16515 net.cpp:193] fc8 needs backward computation. -I0906 13:58:16.191633 16515 net.cpp:193] relu7 needs backward computation. -I0906 13:58:16.191639 16515 net.cpp:193] fc7 needs backward computation. -I0906 13:58:16.191644 16515 net.cpp:193] relu6 needs backward computation. -I0906 13:58:16.191650 16515 net.cpp:193] fc6 needs backward computation. -I0906 13:58:16.191655 16515 net.cpp:193] pool5 needs backward computation. -I0906 13:58:16.191661 16515 net.cpp:193] relu5 needs backward computation. -I0906 13:58:16.191666 16515 net.cpp:193] conv5 needs backward computation. -I0906 13:58:16.191673 16515 net.cpp:193] relu4 needs backward computation. -I0906 13:58:16.191678 16515 net.cpp:193] conv4 needs backward computation. -I0906 13:58:16.191684 16515 net.cpp:193] relu3 needs backward computation. -I0906 13:58:16.191689 16515 net.cpp:193] conv3 needs backward computation. -I0906 13:58:16.191696 16515 net.cpp:193] pool2 needs backward computation. 
-I0906 13:58:16.191702 16515 net.cpp:193] norm2 needs backward computation. -I0906 13:58:16.191709 16515 net.cpp:193] relu2 needs backward computation. -I0906 13:58:16.191714 16515 net.cpp:193] conv2 needs backward computation. -I0906 13:58:16.191720 16515 net.cpp:193] pool1 needs backward computation. -I0906 13:58:16.191725 16515 net.cpp:193] norm1 needs backward computation. -I0906 13:58:16.191731 16515 net.cpp:193] relu1 needs backward computation. -I0906 13:58:16.191737 16515 net.cpp:193] conv1 needs backward computation. -I0906 13:58:16.191745 16515 net.cpp:195] data does not need backward computation. -I0906 13:58:16.191753 16515 net.cpp:236] This network produces output loss -I0906 13:58:16.191787 16515 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:58:16.191803 16515 net.cpp:248] Network initialization done. -I0906 13:58:16.191807 16515 net.cpp:249] Memory required for data: 828648004 -I0906 13:58:16.192769 16515 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:58:16.192881 16515 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data -I0906 13:58:16.193114 16515 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TEST -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } 
-} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" 
- param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "fc8" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:58:16.193480 16515 net.cpp:68] Memory required for data: 0 -I0906 13:58:16.193527 16515 layer_factory.hpp:74] Creating layer data -I0906 13:58:16.193549 16515 
net.cpp:91] Creating Layer data -I0906 13:58:16.193559 16515 net.cpp:369] data -> data -I0906 13:58:16.193583 16515 net.cpp:369] data -> label -I0906 13:58:16.193595 16515 net.cpp:121] Setting up data -I0906 13:58:16.193603 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:58:16.202100 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb -I0906 13:58:16.202343 16515 data_layer.cpp:53] output data size: 50,3,227,227 -I0906 13:58:16.219017 16515 base_data_layer.cpp:43] Initializing prefetch -I0906 13:58:16.219137 16515 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:58:16.219171 16515 net.cpp:128] Top shape: 50 3 227 227 (7729350) -I0906 13:58:16.219179 16515 net.cpp:128] Top shape: 50 (50) -I0906 13:58:16.219183 16515 net.cpp:134] Memory required for data: 30917600 -I0906 13:58:16.219214 16515 layer_factory.hpp:74] Creating layer label_data_1_split -I0906 13:58:16.219279 16515 net.cpp:91] Creating Layer label_data_1_split -I0906 13:58:16.219293 16515 net.cpp:411] label_data_1_split <- label -I0906 13:58:16.219367 16515 net.cpp:369] label_data_1_split -> label_data_1_split_0 -I0906 13:58:16.219409 16515 net.cpp:369] label_data_1_split -> label_data_1_split_1 -I0906 13:58:16.219420 16515 net.cpp:121] Setting up label_data_1_split -I0906 13:58:16.219455 16515 net.cpp:128] Top shape: 50 (50) -I0906 13:58:16.219462 16515 net.cpp:128] Top shape: 50 (50) -I0906 13:58:16.219466 16515 net.cpp:134] Memory required for data: 30918000 -I0906 13:58:16.219471 16515 layer_factory.hpp:74] Creating layer conv1 -I0906 13:58:16.219508 16515 net.cpp:91] Creating Layer conv1 -I0906 13:58:16.219513 16515 net.cpp:411] conv1 <- data -I0906 13:58:16.219530 16515 net.cpp:369] conv1 -> conv1 -I0906 13:58:16.219545 16515 net.cpp:121] Setting up conv1 -I0906 13:58:16.224315 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:58:16.224321 16515 net.cpp:134] Memory required for 
data: 88998000 -I0906 13:58:16.224341 16515 layer_factory.hpp:74] Creating layer relu1 -I0906 13:58:16.224354 16515 net.cpp:91] Creating Layer relu1 -I0906 13:58:16.224360 16515 net.cpp:411] relu1 <- conv1 -I0906 13:58:16.224372 16515 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:58:16.224382 16515 net.cpp:121] Setting up relu1 -I0906 13:58:16.224390 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:58:16.224393 16515 net.cpp:134] Memory required for data: 147078000 -I0906 13:58:16.224398 16515 layer_factory.hpp:74] Creating layer norm1 -I0906 13:58:16.224417 16515 net.cpp:91] Creating Layer norm1 -I0906 13:58:16.224423 16515 net.cpp:411] norm1 <- conv1 -I0906 13:58:16.224436 16515 net.cpp:369] norm1 -> norm1 -I0906 13:58:16.224447 16515 net.cpp:121] Setting up norm1 -I0906 13:58:16.224465 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:58:16.224508 16515 net.cpp:134] Memory required for data: 205158000 -I0906 13:58:16.224514 16515 layer_factory.hpp:74] Creating layer pool1 -I0906 13:58:16.224529 16515 net.cpp:91] Creating Layer pool1 -I0906 13:58:16.224534 16515 net.cpp:411] pool1 <- norm1 -I0906 13:58:16.224547 16515 net.cpp:369] pool1 -> pool1 -I0906 13:58:16.224558 16515 net.cpp:121] Setting up pool1 -I0906 13:58:16.224576 16515 net.cpp:128] Top shape: 50 96 27 27 (3499200) -I0906 13:58:16.224581 16515 net.cpp:134] Memory required for data: 219154800 -I0906 13:58:16.224586 16515 layer_factory.hpp:74] Creating layer conv2 -I0906 13:58:16.224601 16515 net.cpp:91] Creating Layer conv2 -I0906 13:58:16.224606 16515 net.cpp:411] conv2 <- pool1 -I0906 13:58:16.224620 16515 net.cpp:369] conv2 -> conv2 -I0906 13:58:16.224632 16515 net.cpp:121] Setting up conv2 -I0906 13:58:16.264878 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:58:16.264889 16515 net.cpp:134] Memory required for data: 256479600 -I0906 13:58:16.264916 16515 layer_factory.hpp:74] Creating layer relu2 -I0906 13:58:16.264937 16515 net.cpp:91] Creating Layer 
relu2 -I0906 13:58:16.264946 16515 net.cpp:411] relu2 <- conv2 -I0906 13:58:16.264966 16515 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:58:16.264978 16515 net.cpp:121] Setting up relu2 -I0906 13:58:16.264987 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:58:16.264991 16515 net.cpp:134] Memory required for data: 293804400 -I0906 13:58:16.264997 16515 layer_factory.hpp:74] Creating layer norm2 -I0906 13:58:16.265015 16515 net.cpp:91] Creating Layer norm2 -I0906 13:58:16.265022 16515 net.cpp:411] norm2 <- conv2 -I0906 13:58:16.265035 16515 net.cpp:369] norm2 -> norm2 -I0906 13:58:16.265050 16515 net.cpp:121] Setting up norm2 -I0906 13:58:16.265072 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:58:16.265077 16515 net.cpp:134] Memory required for data: 331129200 -I0906 13:58:16.265082 16515 layer_factory.hpp:74] Creating layer pool2 -I0906 13:58:16.265097 16515 net.cpp:91] Creating Layer pool2 -I0906 13:58:16.265103 16515 net.cpp:411] pool2 <- norm2 -I0906 13:58:16.265116 16515 net.cpp:369] pool2 -> pool2 -I0906 13:58:16.265127 16515 net.cpp:121] Setting up pool2 -I0906 13:58:16.265149 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:58:16.265153 16515 net.cpp:134] Memory required for data: 339782000 -I0906 13:58:16.265158 16515 layer_factory.hpp:74] Creating layer conv3 -I0906 13:58:16.265179 16515 net.cpp:91] Creating Layer conv3 -I0906 13:58:16.265184 16515 net.cpp:411] conv3 <- pool2 -I0906 13:58:16.265200 16515 net.cpp:369] conv3 -> conv3 -I0906 13:58:16.265213 16515 net.cpp:121] Setting up conv3 -I0906 13:58:16.312928 16520 data_layer.cpp:120] Prefetch batch: 93 ms. -I0906 13:58:16.312959 16520 data_layer.cpp:121] Read time: 12.075 ms. -I0906 13:58:16.312966 16520 data_layer.cpp:122] Transform time: 80.513 ms. 
-I0906 13:58:16.381564 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:58:16.381587 16515 net.cpp:134] Memory required for data: 352761200 -I0906 13:58:16.381628 16515 layer_factory.hpp:74] Creating layer relu3 -I0906 13:58:16.381660 16515 net.cpp:91] Creating Layer relu3 -I0906 13:58:16.381675 16515 net.cpp:411] relu3 <- conv3 -I0906 13:58:16.381700 16515 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:58:16.381717 16515 net.cpp:121] Setting up relu3 -I0906 13:58:16.381726 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:58:16.381731 16515 net.cpp:134] Memory required for data: 365740400 -I0906 13:58:16.381734 16515 layer_factory.hpp:74] Creating layer conv4 -I0906 13:58:16.381762 16515 net.cpp:91] Creating Layer conv4 -I0906 13:58:16.381767 16515 net.cpp:411] conv4 <- conv3 -I0906 13:58:16.381783 16515 net.cpp:369] conv4 -> conv4 -I0906 13:58:16.381798 16515 net.cpp:121] Setting up conv4 -I0906 13:58:16.468471 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:58:16.468492 16515 net.cpp:134] Memory required for data: 378719600 -I0906 13:58:16.468518 16515 layer_factory.hpp:74] Creating layer relu4 -I0906 13:58:16.468550 16515 net.cpp:91] Creating Layer relu4 -I0906 13:58:16.468605 16515 net.cpp:411] relu4 <- conv4 -I0906 13:58:16.468633 16515 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:58:16.468649 16515 net.cpp:121] Setting up relu4 -I0906 13:58:16.468658 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:58:16.468662 16515 net.cpp:134] Memory required for data: 391698800 -I0906 13:58:16.468667 16515 layer_factory.hpp:74] Creating layer conv5 -I0906 13:58:16.468694 16515 net.cpp:91] Creating Layer conv5 -I0906 13:58:16.468700 16515 net.cpp:411] conv5 <- conv4 -I0906 13:58:16.468716 16515 net.cpp:369] conv5 -> conv5 -I0906 13:58:16.468731 16515 net.cpp:121] Setting up conv5 -I0906 13:58:16.526487 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:58:16.526507 16515 net.cpp:134] Memory required 
for data: 400351600 -I0906 13:58:16.526547 16515 layer_factory.hpp:74] Creating layer relu5 -I0906 13:58:16.526577 16515 net.cpp:91] Creating Layer relu5 -I0906 13:58:16.526590 16515 net.cpp:411] relu5 <- conv5 -I0906 13:58:16.526614 16515 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:58:16.526630 16515 net.cpp:121] Setting up relu5 -I0906 13:58:16.526639 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:58:16.526643 16515 net.cpp:134] Memory required for data: 409004400 -I0906 13:58:16.526648 16515 layer_factory.hpp:74] Creating layer pool5 -I0906 13:58:16.526676 16515 net.cpp:91] Creating Layer pool5 -I0906 13:58:16.526682 16515 net.cpp:411] pool5 <- conv5 -I0906 13:58:16.526696 16515 net.cpp:369] pool5 -> pool5 -I0906 13:58:16.526710 16515 net.cpp:121] Setting up pool5 -I0906 13:58:16.526731 16515 net.cpp:128] Top shape: 50 256 6 6 (460800) -I0906 13:58:16.526734 16515 net.cpp:134] Memory required for data: 410847600 -I0906 13:58:16.526739 16515 layer_factory.hpp:74] Creating layer fc6 -I0906 13:58:16.526762 16515 net.cpp:91] Creating Layer fc6 -I0906 13:58:16.526767 16515 net.cpp:411] fc6 <- pool5 -I0906 13:58:16.526782 16515 net.cpp:369] fc6 -> fc6 -I0906 13:58:16.526794 16515 net.cpp:121] Setting up fc6 -I0906 13:58:21.365124 16515 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:58:21.365149 16515 net.cpp:134] Memory required for data: 411666800 -I0906 13:58:21.365176 16515 layer_factory.hpp:74] Creating layer relu6 -I0906 13:58:21.365211 16515 net.cpp:91] Creating Layer relu6 -I0906 13:58:21.365226 16515 net.cpp:411] relu6 <- fc6 -I0906 13:58:21.365250 16515 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:58:21.365267 16515 net.cpp:121] Setting up relu6 -I0906 13:58:21.365277 16515 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:58:21.365280 16515 net.cpp:134] Memory required for data: 412486000 -I0906 13:58:21.365285 16515 layer_factory.hpp:74] Creating layer fc7 -I0906 13:58:21.365309 16515 net.cpp:91] Creating Layer fc7 -I0906 
13:58:21.365314 16515 net.cpp:411] fc7 <- fc6 -I0906 13:58:21.365330 16515 net.cpp:369] fc7 -> fc7 -I0906 13:58:21.365345 16515 net.cpp:121] Setting up fc7 -I0906 13:58:23.510701 16515 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:58:23.510725 16515 net.cpp:134] Memory required for data: 413305200 -I0906 13:58:23.510752 16515 layer_factory.hpp:74] Creating layer relu7 -I0906 13:58:23.510785 16515 net.cpp:91] Creating Layer relu7 -I0906 13:58:23.510800 16515 net.cpp:411] relu7 <- fc7 -I0906 13:58:23.510828 16515 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:58:23.510844 16515 net.cpp:121] Setting up relu7 -I0906 13:58:23.510854 16515 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:58:23.510857 16515 net.cpp:134] Memory required for data: 414124400 -I0906 13:58:23.510862 16515 layer_factory.hpp:74] Creating layer fc8 -I0906 13:58:23.510885 16515 net.cpp:91] Creating Layer fc8 -I0906 13:58:23.510890 16515 net.cpp:411] fc8 <- fc7 -I0906 13:58:23.510906 16515 net.cpp:369] fc8 -> fc8 -I0906 13:58:23.510932 16515 net.cpp:121] Setting up fc8 -I0906 13:58:24.034812 16515 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:58:24.034833 16515 net.cpp:134] Memory required for data: 414324400 -I0906 13:58:24.034860 16515 layer_factory.hpp:74] Creating layer fc8_fc8_0_split -I0906 13:58:24.034893 16515 net.cpp:91] Creating Layer fc8_fc8_0_split -I0906 13:58:24.034958 16515 net.cpp:411] fc8_fc8_0_split <- fc8 -I0906 13:58:24.034988 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 -I0906 13:58:24.035012 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 -I0906 13:58:24.035023 16515 net.cpp:121] Setting up fc8_fc8_0_split -I0906 13:58:24.035040 16515 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:58:24.035046 16515 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:58:24.035050 16515 net.cpp:134] Memory required for data: 414724400 -I0906 13:58:24.035055 16515 layer_factory.hpp:74] Creating layer accuracy -I0906 13:58:24.035086 16515 net.cpp:91] Creating Layer 
accuracy -I0906 13:58:24.035092 16515 net.cpp:411] accuracy <- fc8_fc8_0_split_0 -I0906 13:58:24.035104 16515 net.cpp:411] accuracy <- label_data_1_split_0 -I0906 13:58:24.035115 16515 net.cpp:369] accuracy -> accuracy -I0906 13:58:24.035126 16515 net.cpp:121] Setting up accuracy -I0906 13:58:24.035143 16515 net.cpp:128] Top shape: (1) -I0906 13:58:24.035147 16515 net.cpp:134] Memory required for data: 414724404 -I0906 13:58:24.035152 16515 layer_factory.hpp:74] Creating layer loss -I0906 13:58:24.035163 16515 net.cpp:91] Creating Layer loss -I0906 13:58:24.035168 16515 net.cpp:411] loss <- fc8_fc8_0_split_1 -I0906 13:58:24.035179 16515 net.cpp:411] loss <- label_data_1_split_1 -I0906 13:58:24.035190 16515 net.cpp:369] loss -> loss -I0906 13:58:24.035202 16515 net.cpp:121] Setting up loss -I0906 13:58:24.035212 16515 layer_factory.hpp:74] Creating layer loss -I0906 13:58:24.035562 16515 net.cpp:128] Top shape: (1) -I0906 13:58:24.035567 16515 net.cpp:130] with loss weight 1 -I0906 13:58:24.035583 16515 net.cpp:134] Memory required for data: 414724408 -I0906 13:58:24.035591 16515 net.cpp:193] loss needs backward computation. -I0906 13:58:24.035598 16515 net.cpp:195] accuracy does not need backward computation. -I0906 13:58:24.035605 16515 net.cpp:193] fc8_fc8_0_split needs backward computation. -I0906 13:58:24.035610 16515 net.cpp:193] fc8 needs backward computation. -I0906 13:58:24.035616 16515 net.cpp:193] relu7 needs backward computation. -I0906 13:58:24.035621 16515 net.cpp:193] fc7 needs backward computation. -I0906 13:58:24.035627 16515 net.cpp:193] relu6 needs backward computation. -I0906 13:58:24.035634 16515 net.cpp:193] fc6 needs backward computation. -I0906 13:58:24.035640 16515 net.cpp:193] pool5 needs backward computation. -I0906 13:58:24.035645 16515 net.cpp:193] relu5 needs backward computation. -I0906 13:58:24.035651 16515 net.cpp:193] conv5 needs backward computation. -I0906 13:58:24.035656 16515 net.cpp:193] relu4 needs backward computation. 
-I0906 13:58:24.035662 16515 net.cpp:193] conv4 needs backward computation. -I0906 13:58:24.035668 16515 net.cpp:193] relu3 needs backward computation. -I0906 13:58:24.035673 16515 net.cpp:193] conv3 needs backward computation. -I0906 13:58:24.035679 16515 net.cpp:193] pool2 needs backward computation. -I0906 13:58:24.035686 16515 net.cpp:193] norm2 needs backward computation. -I0906 13:58:24.035692 16515 net.cpp:193] relu2 needs backward computation. -I0906 13:58:24.035697 16515 net.cpp:193] conv2 needs backward computation. -I0906 13:58:24.035703 16515 net.cpp:193] pool1 needs backward computation. -I0906 13:58:24.035709 16515 net.cpp:193] norm1 needs backward computation. -I0906 13:58:24.035715 16515 net.cpp:193] relu1 needs backward computation. -I0906 13:58:24.035720 16515 net.cpp:193] conv1 needs backward computation. -I0906 13:58:24.035727 16515 net.cpp:195] label_data_1_split does not need backward computation. -I0906 13:58:24.035734 16515 net.cpp:195] data does not need backward computation. -I0906 13:58:24.035739 16515 net.cpp:236] This network produces output accuracy -I0906 13:58:24.035745 16515 net.cpp:236] This network produces output loss -I0906 13:58:24.035781 16515 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:58:24.035796 16515 net.cpp:248] Network initialization done. -I0906 13:58:24.035799 16515 net.cpp:249] Memory required for data: 414724408 -I0906 13:58:24.036000 16515 solver.cpp:53] Solver scaffolding done. 
-I0906 13:58:24.036130 16515 solver.cpp \ No newline at end of file diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 deleted file mode 100644 index d142f7c0..00000000 --- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 +++ /dev/null @@ -1,1208 +0,0 @@ -Log file created at: 2015/09/06 13:58:55 -Running on machine: AMD-RESEARCH -Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg -I0906 13:58:55.707435 16537 caffe.cpp:114] Use GPU with device ID 0 -I0906 13:58:55.745967 16537 device.cpp:230] Number of platforms found:1 -I0906 13:58:55.746011 16537 device.cpp:262] CL_PLATFORM_NAME AMD Accelerated Parallel Processing -I0906 13:58:55.746028 16537 device.cpp:262] CL_PLATFORM_PROFILE FULL_PROFILE -I0906 13:58:55.746036 16537 device.cpp:262] CL_PLATFORM_VERSION OpenCL 2.0 AMD-APP.internal (1644.0) -I0906 13:58:55.746042 16537 device.cpp:262] CL_PLATFORM_VENDOR Advanced Micro Devices, Inc. 
-I0906 13:58:55.746048 16537 device.cpp:262] CL_PLATFORM_EXTENSIONS cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices -I0906 13:58:55.746059 16537 device.cpp:286] Number of devices found:1 -I0906 13:58:55.746064 16537 device.cpp:288] DeviceID: 0x18262f0 -I0906 13:58:55.746088 16537 device.cpp:366] Device Type: CL_DEVICE_TYPE_GPU -I0906 13:58:55.746098 16537 device.cpp:393] Is it integrated GPU?: 0 -I0906 13:58:55.746105 16537 device.cpp:393] Max clock frequency MHz: 930 -I0906 13:58:55.746111 16537 device.cpp:393] Host-Device unified mem: 0 -I0906 13:58:55.746117 16537 device.cpp:393] ECC support: 0 -I0906 13:58:55.746124 16537 device.cpp:393] Endian little: 1 -I0906 13:58:55.746130 16537 device.cpp:393] Max compute units: 44 -I0906 13:58:55.746136 16537 device.cpp:393] Max work group size: 256 -I0906 13:58:55.746145 16537 device.cpp:393] Max work item dimensions: 3 -I0906 13:58:55.746151 16537 device.cpp:393] Max work item sizes: 0x100 -I0906 13:58:55.746160 16537 device.cpp:389] CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE -I0906 13:58:55.746167 16537 device.cpp:378] CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL -I0906 13:58:55.746173 16537 device.cpp:393] Max mem alloc size: 4244635648 -I0906 13:58:55.746179 16537 device.cpp:393] Global mem size: 16878927872 -I0906 13:58:55.746186 16537 device.cpp:393] Local mem size: 32768 -I0906 13:58:55.746198 16537 device.cpp:96] Picked device type : GPU 0 -I0906 13:58:58.131669 16537 device.cpp:152] Build Program -I0906 13:58:58.131891 16537 caffe.cpp:122] Starting Optimization -I0906 13:58:58.132027 16537 solver.cpp:40] Initializing solver from parameters: -test_iter: 1 -test_interval: 1000 -base_lr: 0.01 -display: 1 -max_iter: 10 -lr_policy: "step" -gamma: 0.1 -momentum: 0.9 -weight_decay: 0.0005 -stepsize: 100000 -snapshot: 10000 -snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" -solver_mode: GPU -net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" -I0906 
13:58:58.132150 16537 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:58:58.133236 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data -I0906 13:58:58.133285 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy -I0906 13:58:58.133460 16537 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TRAIN -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: 
"norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - 
} -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:58:58.133894 16537 net.cpp:68] Memory required for data: 0 -I0906 13:58:58.134050 16537 layer_factory.hpp:74] Creating layer data -I0906 13:58:58.134104 16537 net.cpp:91] Creating Layer data -I0906 13:58:58.134125 16537 net.cpp:369] data -> data -I0906 13:58:58.134229 16537 net.cpp:369] data -> label -I0906 13:58:58.134253 16537 net.cpp:121] Setting up data -I0906 13:58:58.134266 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:58:58.143668 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb -I0906 13:58:58.144057 16537 data_layer.cpp:53] output data size: 100,3,227,227 -I0906 13:58:58.175259 16537 base_data_layer.cpp:43] Initializing prefetch -I0906 13:58:58.175475 16537 base_data_layer.cpp:45] Prefetch initialized. 
-I0906 13:58:58.175534 16537 net.cpp:128] Top shape: 100 3 227 227 (15458700) -I0906 13:58:58.175544 16537 net.cpp:128] Top shape: 100 (100) -I0906 13:58:58.175547 16537 net.cpp:134] Memory required for data: 61835200 -I0906 13:58:58.175582 16537 layer_factory.hpp:74] Creating layer conv1 -I0906 13:58:58.175659 16537 net.cpp:91] Creating Layer conv1 -I0906 13:58:58.175683 16537 net.cpp:411] conv1 <- data -I0906 13:58:58.175760 16537 net.cpp:369] conv1 -> conv1 -I0906 13:58:58.175793 16537 net.cpp:121] Setting up conv1 -I0906 13:58:58.180706 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:58.180712 16537 net.cpp:134] Memory required for data: 177995200 -I0906 13:58:58.180752 16537 layer_factory.hpp:74] Creating layer relu1 -I0906 13:58:58.180774 16537 net.cpp:91] Creating Layer relu1 -I0906 13:58:58.180780 16537 net.cpp:411] relu1 <- conv1 -I0906 13:58:58.180794 16537 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:58:58.180804 16537 net.cpp:121] Setting up relu1 -I0906 13:58:58.180811 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:58.180815 16537 net.cpp:134] Memory required for data: 294155200 -I0906 13:58:58.180821 16537 layer_factory.hpp:74] Creating layer norm1 -I0906 13:58:58.180848 16537 net.cpp:91] Creating Layer norm1 -I0906 13:58:58.180855 16537 net.cpp:411] norm1 <- conv1 -I0906 13:58:58.180867 16537 net.cpp:369] norm1 -> norm1 -I0906 13:58:58.180881 16537 net.cpp:121] Setting up norm1 -I0906 13:58:58.180905 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000) -I0906 13:58:58.180909 16537 net.cpp:134] Memory required for data: 410315200 -I0906 13:58:58.180915 16537 layer_factory.hpp:74] Creating layer pool1 -I0906 13:58:58.180938 16537 net.cpp:91] Creating Layer pool1 -I0906 13:58:58.180944 16537 net.cpp:411] pool1 <- norm1 -I0906 13:58:58.180958 16537 net.cpp:369] pool1 -> pool1 -I0906 13:58:58.180970 16537 net.cpp:121] Setting up pool1 -I0906 13:58:58.180999 16537 net.cpp:128] Top shape: 100 96 27 27 (6998400) 
-I0906 13:58:58.181004 16537 net.cpp:134] Memory required for data: 438308800 -I0906 13:58:58.181008 16537 layer_factory.hpp:74] Creating layer conv2 -I0906 13:58:58.181023 16537 net.cpp:91] Creating Layer conv2 -I0906 13:58:58.181030 16537 net.cpp:411] conv2 <- pool1 -I0906 13:58:58.181044 16537 net.cpp:369] conv2 -> conv2 -I0906 13:58:58.181056 16537 net.cpp:121] Setting up conv2 -I0906 13:58:58.221200 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:58.221215 16537 net.cpp:134] Memory required for data: 512958400 -I0906 13:58:58.221245 16537 layer_factory.hpp:74] Creating layer relu2 -I0906 13:58:58.221267 16537 net.cpp:91] Creating Layer relu2 -I0906 13:58:58.221277 16537 net.cpp:411] relu2 <- conv2 -I0906 13:58:58.221297 16537 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:58:58.221312 16537 net.cpp:121] Setting up relu2 -I0906 13:58:58.221320 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:58.221324 16537 net.cpp:134] Memory required for data: 587608000 -I0906 13:58:58.221329 16537 layer_factory.hpp:74] Creating layer norm2 -I0906 13:58:58.221346 16537 net.cpp:91] Creating Layer norm2 -I0906 13:58:58.221352 16537 net.cpp:411] norm2 <- conv2 -I0906 13:58:58.221366 16537 net.cpp:369] norm2 -> norm2 -I0906 13:58:58.221379 16537 net.cpp:121] Setting up norm2 -I0906 13:58:58.221397 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400) -I0906 13:58:58.221402 16537 net.cpp:134] Memory required for data: 662257600 -I0906 13:58:58.221407 16537 layer_factory.hpp:74] Creating layer pool2 -I0906 13:58:58.221429 16537 net.cpp:91] Creating Layer pool2 -I0906 13:58:58.221436 16537 net.cpp:411] pool2 <- norm2 -I0906 13:58:58.221448 16537 net.cpp:369] pool2 -> pool2 -I0906 13:58:58.221460 16537 net.cpp:121] Setting up pool2 -I0906 13:58:58.221480 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:58.221484 16537 net.cpp:134] Memory required for data: 679563200 -I0906 13:58:58.221534 16537 layer_factory.hpp:74] Creating 
layer conv3 -I0906 13:58:58.221555 16537 net.cpp:91] Creating Layer conv3 -I0906 13:58:58.221561 16537 net.cpp:411] conv3 <- pool2 -I0906 13:58:58.221576 16537 net.cpp:369] conv3 -> conv3 -I0906 13:58:58.221592 16537 net.cpp:121] Setting up conv3 -I0906 13:58:58.338774 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:58.338798 16537 net.cpp:134] Memory required for data: 705521600 -I0906 13:58:58.338841 16537 layer_factory.hpp:74] Creating layer relu3 -I0906 13:58:58.338876 16537 net.cpp:91] Creating Layer relu3 -I0906 13:58:58.338891 16537 net.cpp:411] relu3 <- conv3 -I0906 13:58:58.338918 16537 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:58:58.338935 16537 net.cpp:121] Setting up relu3 -I0906 13:58:58.338944 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:58.338948 16537 net.cpp:134] Memory required for data: 731480000 -I0906 13:58:58.338953 16537 layer_factory.hpp:74] Creating layer conv4 -I0906 13:58:58.338979 16537 net.cpp:91] Creating Layer conv4 -I0906 13:58:58.338985 16537 net.cpp:411] conv4 <- conv3 -I0906 13:58:58.339002 16537 net.cpp:369] conv4 -> conv4 -I0906 13:58:58.339017 16537 net.cpp:121] Setting up conv4 -I0906 13:58:58.369153 16541 data_layer.cpp:120] Prefetch batch: 193 ms. -I0906 13:58:58.369201 16541 data_layer.cpp:121] Read time: 23.991 ms. -I0906 13:58:58.369210 16541 data_layer.cpp:122] Transform time: 167.322 ms. 
-I0906 13:58:58.426654 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:58.426676 16537 net.cpp:134] Memory required for data: 757438400 -I0906 13:58:58.426703 16537 layer_factory.hpp:74] Creating layer relu4 -I0906 13:58:58.426735 16537 net.cpp:91] Creating Layer relu4 -I0906 13:58:58.426749 16537 net.cpp:411] relu4 <- conv4 -I0906 13:58:58.426776 16537 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:58:58.426794 16537 net.cpp:121] Setting up relu4 -I0906 13:58:58.426802 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600) -I0906 13:58:58.426806 16537 net.cpp:134] Memory required for data: 783396800 -I0906 13:58:58.426811 16537 layer_factory.hpp:74] Creating layer conv5 -I0906 13:58:58.426838 16537 net.cpp:91] Creating Layer conv5 -I0906 13:58:58.426843 16537 net.cpp:411] conv5 <- conv4 -I0906 13:58:58.426858 16537 net.cpp:369] conv5 -> conv5 -I0906 13:58:58.426873 16537 net.cpp:121] Setting up conv5 -I0906 13:58:58.484124 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:58.484143 16537 net.cpp:134] Memory required for data: 800702400 -I0906 13:58:58.484182 16537 layer_factory.hpp:74] Creating layer relu5 -I0906 13:58:58.484212 16537 net.cpp:91] Creating Layer relu5 -I0906 13:58:58.484225 16537 net.cpp:411] relu5 <- conv5 -I0906 13:58:58.484251 16537 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:58:58.484266 16537 net.cpp:121] Setting up relu5 -I0906 13:58:58.484274 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400) -I0906 13:58:58.484278 16537 net.cpp:134] Memory required for data: 818008000 -I0906 13:58:58.484282 16537 layer_factory.hpp:74] Creating layer pool5 -I0906 13:58:58.484302 16537 net.cpp:91] Creating Layer pool5 -I0906 13:58:58.484308 16537 net.cpp:411] pool5 <- conv5 -I0906 13:58:58.484321 16537 net.cpp:369] pool5 -> pool5 -I0906 13:58:58.484335 16537 net.cpp:121] Setting up pool5 -I0906 13:58:58.484355 16537 net.cpp:128] Top shape: 100 256 6 6 (921600) -I0906 13:58:58.484359 16537 net.cpp:134] Memory 
required for data: 821694400 -I0906 13:58:58.484364 16537 layer_factory.hpp:74] Creating layer fc6 -I0906 13:58:58.484400 16537 net.cpp:91] Creating Layer fc6 -I0906 13:58:58.484405 16537 net.cpp:411] fc6 <- pool5 -I0906 13:58:58.484421 16537 net.cpp:369] fc6 -> fc6 -I0906 13:58:58.484434 16537 net.cpp:121] Setting up fc6 -I0906 13:59:03.394265 16537 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:59:03.394289 16537 net.cpp:134] Memory required for data: 823332800 -I0906 13:59:03.394316 16537 layer_factory.hpp:74] Creating layer relu6 -I0906 13:59:03.394362 16537 net.cpp:91] Creating Layer relu6 -I0906 13:59:03.394378 16537 net.cpp:411] relu6 <- fc6 -I0906 13:59:03.394405 16537 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:59:03.394472 16537 net.cpp:121] Setting up relu6 -I0906 13:59:03.394482 16537 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:59:03.394486 16537 net.cpp:134] Memory required for data: 824971200 -I0906 13:59:03.394492 16537 layer_factory.hpp:74] Creating layer fc7 -I0906 13:59:03.394515 16537 net.cpp:91] Creating Layer fc7 -I0906 13:59:03.394521 16537 net.cpp:411] fc7 <- fc6 -I0906 13:59:03.394537 16537 net.cpp:369] fc7 -> fc7 -I0906 13:59:03.394558 16537 net.cpp:121] Setting up fc7 -I0906 13:59:05.554731 16537 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:59:05.554755 16537 net.cpp:134] Memory required for data: 826609600 -I0906 13:59:05.554782 16537 layer_factory.hpp:74] Creating layer relu7 -I0906 13:59:05.554815 16537 net.cpp:91] Creating Layer relu7 -I0906 13:59:05.554829 16537 net.cpp:411] relu7 <- fc7 -I0906 13:59:05.554855 16537 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:59:05.554870 16537 net.cpp:121] Setting up relu7 -I0906 13:59:05.554879 16537 net.cpp:128] Top shape: 100 4096 (409600) -I0906 13:59:05.554883 16537 net.cpp:134] Memory required for data: 828248000 -I0906 13:59:05.554888 16537 layer_factory.hpp:74] Creating layer fc8 -I0906 13:59:05.554911 16537 net.cpp:91] Creating Layer fc8 -I0906 13:59:05.554916 16537 
net.cpp:411] fc8 <- fc7 -I0906 13:59:05.554932 16537 net.cpp:369] fc8 -> fc8 -I0906 13:59:05.554946 16537 net.cpp:121] Setting up fc8 -I0906 13:59:06.080322 16537 net.cpp:128] Top shape: 100 1000 (100000) -I0906 13:59:06.080343 16537 net.cpp:134] Memory required for data: 828648000 -I0906 13:59:06.080370 16537 layer_factory.hpp:74] Creating layer loss -I0906 13:59:06.080420 16537 net.cpp:91] Creating Layer loss -I0906 13:59:06.080435 16537 net.cpp:411] loss <- fc8 -I0906 13:59:06.080457 16537 net.cpp:411] loss <- label -I0906 13:59:06.080476 16537 net.cpp:369] loss -> loss -I0906 13:59:06.080497 16537 net.cpp:121] Setting up loss -I0906 13:59:06.080515 16537 layer_factory.hpp:74] Creating layer loss -I0906 13:59:06.081025 16537 net.cpp:128] Top shape: (1) -I0906 13:59:06.081030 16537 net.cpp:130] with loss weight 1 -I0906 13:59:06.081048 16537 net.cpp:134] Memory required for data: 828648004 -I0906 13:59:06.081055 16537 net.cpp:193] loss needs backward computation. -I0906 13:59:06.081063 16537 net.cpp:193] fc8 needs backward computation. -I0906 13:59:06.081069 16537 net.cpp:193] relu7 needs backward computation. -I0906 13:59:06.081074 16537 net.cpp:193] fc7 needs backward computation. -I0906 13:59:06.081080 16537 net.cpp:193] relu6 needs backward computation. -I0906 13:59:06.081086 16537 net.cpp:193] fc6 needs backward computation. -I0906 13:59:06.081091 16537 net.cpp:193] pool5 needs backward computation. -I0906 13:59:06.081097 16537 net.cpp:193] relu5 needs backward computation. -I0906 13:59:06.081102 16537 net.cpp:193] conv5 needs backward computation. -I0906 13:59:06.081109 16537 net.cpp:193] relu4 needs backward computation. -I0906 13:59:06.081114 16537 net.cpp:193] conv4 needs backward computation. -I0906 13:59:06.081120 16537 net.cpp:193] relu3 needs backward computation. -I0906 13:59:06.081125 16537 net.cpp:193] conv3 needs backward computation. -I0906 13:59:06.081132 16537 net.cpp:193] pool2 needs backward computation. 
-I0906 13:59:06.081138 16537 net.cpp:193] norm2 needs backward computation. -I0906 13:59:06.081145 16537 net.cpp:193] relu2 needs backward computation. -I0906 13:59:06.081149 16537 net.cpp:193] conv2 needs backward computation. -I0906 13:59:06.081156 16537 net.cpp:193] pool1 needs backward computation. -I0906 13:59:06.081161 16537 net.cpp:193] norm1 needs backward computation. -I0906 13:59:06.081167 16537 net.cpp:193] relu1 needs backward computation. -I0906 13:59:06.081173 16537 net.cpp:193] conv1 needs backward computation. -I0906 13:59:06.081181 16537 net.cpp:195] data does not need backward computation. -I0906 13:59:06.081187 16537 net.cpp:236] This network produces output loss -I0906 13:59:06.081223 16537 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:59:06.081238 16537 net.cpp:248] Network initialization done. -I0906 13:59:06.081241 16537 net.cpp:249] Memory required for data: 828648004 -I0906 13:59:06.082168 16537 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt -I0906 13:59:06.082299 16537 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data -I0906 13:59:06.082527 16537 net.cpp:43] Initializing net from parameters: -name: "AlexNet" -state { - phase: TEST -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 227 - mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" - } - data_param { - source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } 
-} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" 
- param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 2 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "fc8" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} -I0906 13:59:06.082866 16537 net.cpp:68] Memory required for data: 0 -I0906 13:59:06.082913 16537 layer_factory.hpp:74] Creating layer data -I0906 13:59:06.082934 16537 
net.cpp:91] Creating Layer data -I0906 13:59:06.082944 16537 net.cpp:369] data -> data -I0906 13:59:06.082967 16537 net.cpp:369] data -> label -I0906 13:59:06.082981 16537 net.cpp:121] Setting up data -I0906 13:59:06.082988 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto -I0906 13:59:06.091397 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb -I0906 13:59:06.091647 16537 data_layer.cpp:53] output data size: 50,3,227,227 -I0906 13:59:06.107939 16537 base_data_layer.cpp:43] Initializing prefetch -I0906 13:59:06.108054 16537 base_data_layer.cpp:45] Prefetch initialized. -I0906 13:59:06.108088 16537 net.cpp:128] Top shape: 50 3 227 227 (7729350) -I0906 13:59:06.108098 16537 net.cpp:128] Top shape: 50 (50) -I0906 13:59:06.108101 16537 net.cpp:134] Memory required for data: 30917600 -I0906 13:59:06.108135 16537 layer_factory.hpp:74] Creating layer label_data_1_split -I0906 13:59:06.108201 16537 net.cpp:91] Creating Layer label_data_1_split -I0906 13:59:06.108216 16537 net.cpp:411] label_data_1_split <- label -I0906 13:59:06.108259 16537 net.cpp:369] label_data_1_split -> label_data_1_split_0 -I0906 13:59:06.108306 16537 net.cpp:369] label_data_1_split -> label_data_1_split_1 -I0906 13:59:06.108319 16537 net.cpp:121] Setting up label_data_1_split -I0906 13:59:06.108353 16537 net.cpp:128] Top shape: 50 (50) -I0906 13:59:06.108361 16537 net.cpp:128] Top shape: 50 (50) -I0906 13:59:06.108364 16537 net.cpp:134] Memory required for data: 30918000 -I0906 13:59:06.108369 16537 layer_factory.hpp:74] Creating layer conv1 -I0906 13:59:06.108403 16537 net.cpp:91] Creating Layer conv1 -I0906 13:59:06.108409 16537 net.cpp:411] conv1 <- data -I0906 13:59:06.108425 16537 net.cpp:369] conv1 -> conv1 -I0906 13:59:06.108440 16537 net.cpp:121] Setting up conv1 -I0906 13:59:06.113059 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:59:06.113065 16537 net.cpp:134] Memory required for 
data: 88998000 -I0906 13:59:06.113085 16537 layer_factory.hpp:74] Creating layer relu1 -I0906 13:59:06.113097 16537 net.cpp:91] Creating Layer relu1 -I0906 13:59:06.113103 16537 net.cpp:411] relu1 <- conv1 -I0906 13:59:06.113116 16537 net.cpp:358] relu1 -> conv1 (in-place) -I0906 13:59:06.113126 16537 net.cpp:121] Setting up relu1 -I0906 13:59:06.113134 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:59:06.113138 16537 net.cpp:134] Memory required for data: 147078000 -I0906 13:59:06.113143 16537 layer_factory.hpp:74] Creating layer norm1 -I0906 13:59:06.113163 16537 net.cpp:91] Creating Layer norm1 -I0906 13:59:06.113169 16537 net.cpp:411] norm1 <- conv1 -I0906 13:59:06.113183 16537 net.cpp:369] norm1 -> norm1 -I0906 13:59:06.113193 16537 net.cpp:121] Setting up norm1 -I0906 13:59:06.113212 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000) -I0906 13:59:06.113255 16537 net.cpp:134] Memory required for data: 205158000 -I0906 13:59:06.113260 16537 layer_factory.hpp:74] Creating layer pool1 -I0906 13:59:06.113277 16537 net.cpp:91] Creating Layer pool1 -I0906 13:59:06.113282 16537 net.cpp:411] pool1 <- norm1 -I0906 13:59:06.113296 16537 net.cpp:369] pool1 -> pool1 -I0906 13:59:06.113306 16537 net.cpp:121] Setting up pool1 -I0906 13:59:06.113325 16537 net.cpp:128] Top shape: 50 96 27 27 (3499200) -I0906 13:59:06.113329 16537 net.cpp:134] Memory required for data: 219154800 -I0906 13:59:06.113334 16537 layer_factory.hpp:74] Creating layer conv2 -I0906 13:59:06.113348 16537 net.cpp:91] Creating Layer conv2 -I0906 13:59:06.113354 16537 net.cpp:411] conv2 <- pool1 -I0906 13:59:06.113369 16537 net.cpp:369] conv2 -> conv2 -I0906 13:59:06.113381 16537 net.cpp:121] Setting up conv2 -I0906 13:59:06.154265 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:59:06.154281 16537 net.cpp:134] Memory required for data: 256479600 -I0906 13:59:06.154316 16537 layer_factory.hpp:74] Creating layer relu2 -I0906 13:59:06.154345 16537 net.cpp:91] Creating Layer 
relu2 -I0906 13:59:06.154355 16537 net.cpp:411] relu2 <- conv2 -I0906 13:59:06.154374 16537 net.cpp:358] relu2 -> conv2 (in-place) -I0906 13:59:06.154387 16537 net.cpp:121] Setting up relu2 -I0906 13:59:06.154397 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:59:06.154400 16537 net.cpp:134] Memory required for data: 293804400 -I0906 13:59:06.154405 16537 layer_factory.hpp:74] Creating layer norm2 -I0906 13:59:06.154427 16537 net.cpp:91] Creating Layer norm2 -I0906 13:59:06.154433 16537 net.cpp:411] norm2 <- conv2 -I0906 13:59:06.154446 16537 net.cpp:369] norm2 -> norm2 -I0906 13:59:06.154463 16537 net.cpp:121] Setting up norm2 -I0906 13:59:06.154484 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200) -I0906 13:59:06.154503 16537 net.cpp:134] Memory required for data: 331129200 -I0906 13:59:06.154508 16537 layer_factory.hpp:74] Creating layer pool2 -I0906 13:59:06.154525 16537 net.cpp:91] Creating Layer pool2 -I0906 13:59:06.154531 16537 net.cpp:411] pool2 <- norm2 -I0906 13:59:06.154544 16537 net.cpp:369] pool2 -> pool2 -I0906 13:59:06.154556 16537 net.cpp:121] Setting up pool2 -I0906 13:59:06.154573 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:59:06.154578 16537 net.cpp:134] Memory required for data: 339782000 -I0906 13:59:06.154583 16537 layer_factory.hpp:74] Creating layer conv3 -I0906 13:59:06.154604 16537 net.cpp:91] Creating Layer conv3 -I0906 13:59:06.154610 16537 net.cpp:411] conv3 <- pool2 -I0906 13:59:06.154625 16537 net.cpp:369] conv3 -> conv3 -I0906 13:59:06.154638 16537 net.cpp:121] Setting up conv3 -I0906 13:59:06.204232 16545 data_layer.cpp:120] Prefetch batch: 96 ms. -I0906 13:59:06.204263 16545 data_layer.cpp:121] Read time: 12.163 ms. -I0906 13:59:06.204272 16545 data_layer.cpp:122] Transform time: 82.876 ms. 
-I0906 13:59:06.270438 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:59:06.270459 16537 net.cpp:134] Memory required for data: 352761200 -I0906 13:59:06.270499 16537 layer_factory.hpp:74] Creating layer relu3 -I0906 13:59:06.270532 16537 net.cpp:91] Creating Layer relu3 -I0906 13:59:06.270546 16537 net.cpp:411] relu3 <- conv3 -I0906 13:59:06.270571 16537 net.cpp:358] relu3 -> conv3 (in-place) -I0906 13:59:06.270587 16537 net.cpp:121] Setting up relu3 -I0906 13:59:06.270596 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:59:06.270601 16537 net.cpp:134] Memory required for data: 365740400 -I0906 13:59:06.270606 16537 layer_factory.hpp:74] Creating layer conv4 -I0906 13:59:06.270630 16537 net.cpp:91] Creating Layer conv4 -I0906 13:59:06.270637 16537 net.cpp:411] conv4 <- conv3 -I0906 13:59:06.270651 16537 net.cpp:369] conv4 -> conv4 -I0906 13:59:06.270666 16537 net.cpp:121] Setting up conv4 -I0906 13:59:06.357051 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:59:06.357074 16537 net.cpp:134] Memory required for data: 378719600 -I0906 13:59:06.357100 16537 layer_factory.hpp:74] Creating layer relu4 -I0906 13:59:06.357132 16537 net.cpp:91] Creating Layer relu4 -I0906 13:59:06.357184 16537 net.cpp:411] relu4 <- conv4 -I0906 13:59:06.357210 16537 net.cpp:358] relu4 -> conv4 (in-place) -I0906 13:59:06.357226 16537 net.cpp:121] Setting up relu4 -I0906 13:59:06.357235 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800) -I0906 13:59:06.357239 16537 net.cpp:134] Memory required for data: 391698800 -I0906 13:59:06.357244 16537 layer_factory.hpp:74] Creating layer conv5 -I0906 13:59:06.357270 16537 net.cpp:91] Creating Layer conv5 -I0906 13:59:06.357276 16537 net.cpp:411] conv5 <- conv4 -I0906 13:59:06.357292 16537 net.cpp:369] conv5 -> conv5 -I0906 13:59:06.357308 16537 net.cpp:121] Setting up conv5 -I0906 13:59:06.414666 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:59:06.414685 16537 net.cpp:134] Memory required 
for data: 400351600 -I0906 13:59:06.414727 16537 layer_factory.hpp:74] Creating layer relu5 -I0906 13:59:06.414757 16537 net.cpp:91] Creating Layer relu5 -I0906 13:59:06.414770 16537 net.cpp:411] relu5 <- conv5 -I0906 13:59:06.414794 16537 net.cpp:358] relu5 -> conv5 (in-place) -I0906 13:59:06.414808 16537 net.cpp:121] Setting up relu5 -I0906 13:59:06.414818 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200) -I0906 13:59:06.414820 16537 net.cpp:134] Memory required for data: 409004400 -I0906 13:59:06.414825 16537 layer_factory.hpp:74] Creating layer pool5 -I0906 13:59:06.414855 16537 net.cpp:91] Creating Layer pool5 -I0906 13:59:06.414860 16537 net.cpp:411] pool5 <- conv5 -I0906 13:59:06.414875 16537 net.cpp:369] pool5 -> pool5 -I0906 13:59:06.414888 16537 net.cpp:121] Setting up pool5 -I0906 13:59:06.414908 16537 net.cpp:128] Top shape: 50 256 6 6 (460800) -I0906 13:59:06.414912 16537 net.cpp:134] Memory required for data: 410847600 -I0906 13:59:06.414917 16537 layer_factory.hpp:74] Creating layer fc6 -I0906 13:59:06.414938 16537 net.cpp:91] Creating Layer fc6 -I0906 13:59:06.414944 16537 net.cpp:411] fc6 <- pool5 -I0906 13:59:06.414959 16537 net.cpp:369] fc6 -> fc6 -I0906 13:59:06.414971 16537 net.cpp:121] Setting up fc6 -I0906 13:59:11.292778 16537 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:59:11.292801 16537 net.cpp:134] Memory required for data: 411666800 -I0906 13:59:11.292829 16537 layer_factory.hpp:74] Creating layer relu6 -I0906 13:59:11.292860 16537 net.cpp:91] Creating Layer relu6 -I0906 13:59:11.292876 16537 net.cpp:411] relu6 <- fc6 -I0906 13:59:11.292902 16537 net.cpp:358] relu6 -> fc6 (in-place) -I0906 13:59:11.292918 16537 net.cpp:121] Setting up relu6 -I0906 13:59:11.292927 16537 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:59:11.292932 16537 net.cpp:134] Memory required for data: 412486000 -I0906 13:59:11.292937 16537 layer_factory.hpp:74] Creating layer fc7 -I0906 13:59:11.292958 16537 net.cpp:91] Creating Layer fc7 -I0906 
13:59:11.292964 16537 net.cpp:411] fc7 <- fc6 -I0906 13:59:11.292980 16537 net.cpp:369] fc7 -> fc7 -I0906 13:59:11.292995 16537 net.cpp:121] Setting up fc7 -I0906 13:59:13.449043 16537 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:59:13.449066 16537 net.cpp:134] Memory required for data: 413305200 -I0906 13:59:13.449095 16537 layer_factory.hpp:74] Creating layer relu7 -I0906 13:59:13.449126 16537 net.cpp:91] Creating Layer relu7 -I0906 13:59:13.449141 16537 net.cpp:411] relu7 <- fc7 -I0906 13:59:13.449167 16537 net.cpp:358] relu7 -> fc7 (in-place) -I0906 13:59:13.449182 16537 net.cpp:121] Setting up relu7 -I0906 13:59:13.449192 16537 net.cpp:128] Top shape: 50 4096 (204800) -I0906 13:59:13.449195 16537 net.cpp:134] Memory required for data: 414124400 -I0906 13:59:13.449200 16537 layer_factory.hpp:74] Creating layer fc8 -I0906 13:59:13.449223 16537 net.cpp:91] Creating Layer fc8 -I0906 13:59:13.449229 16537 net.cpp:411] fc8 <- fc7 -I0906 13:59:13.449244 16537 net.cpp:369] fc8 -> fc8 -I0906 13:59:13.449270 16537 net.cpp:121] Setting up fc8 -I0906 13:59:13.974771 16537 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:59:13.974793 16537 net.cpp:134] Memory required for data: 414324400 -I0906 13:59:13.974820 16537 layer_factory.hpp:74] Creating layer fc8_fc8_0_split -I0906 13:59:13.974851 16537 net.cpp:91] Creating Layer fc8_fc8_0_split -I0906 13:59:13.974911 16537 net.cpp:411] fc8_fc8_0_split <- fc8 -I0906 13:59:13.974939 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0 -I0906 13:59:13.974962 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1 -I0906 13:59:13.974974 16537 net.cpp:121] Setting up fc8_fc8_0_split -I0906 13:59:13.974992 16537 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:59:13.974998 16537 net.cpp:128] Top shape: 50 1000 (50000) -I0906 13:59:13.975003 16537 net.cpp:134] Memory required for data: 414724400 -I0906 13:59:13.975006 16537 layer_factory.hpp:74] Creating layer accuracy -I0906 13:59:13.975038 16537 net.cpp:91] Creating Layer 
accuracy -I0906 13:59:13.975044 16537 net.cpp:411] accuracy <- fc8_fc8_0_split_0 -I0906 13:59:13.975054 16537 net.cpp:411] accuracy <- label_data_1_split_0 -I0906 13:59:13.975065 16537 net.cpp:369] accuracy -> accuracy -I0906 13:59:13.975076 16537 net.cpp:121] Setting up accuracy -I0906 13:59:13.975092 16537 net.cpp:128] Top shape: (1) -I0906 13:59:13.975096 16537 net.cpp:134] Memory required for data: 414724404 -I0906 13:59:13.975101 16537 layer_factory.hpp:74] Creating layer loss -I0906 13:59:13.975112 16537 net.cpp:91] Creating Layer loss -I0906 13:59:13.975117 16537 net.cpp:411] loss <- fc8_fc8_0_split_1 -I0906 13:59:13.975128 16537 net.cpp:411] loss <- label_data_1_split_1 -I0906 13:59:13.975139 16537 net.cpp:369] loss -> loss -I0906 13:59:13.975150 16537 net.cpp:121] Setting up loss -I0906 13:59:13.975160 16537 layer_factory.hpp:74] Creating layer loss -I0906 13:59:13.975487 16537 net.cpp:128] Top shape: (1) -I0906 13:59:13.975492 16537 net.cpp:130] with loss weight 1 -I0906 13:59:13.975507 16537 net.cpp:134] Memory required for data: 414724408 -I0906 13:59:13.975513 16537 net.cpp:193] loss needs backward computation. -I0906 13:59:13.975520 16537 net.cpp:195] accuracy does not need backward computation. -I0906 13:59:13.975528 16537 net.cpp:193] fc8_fc8_0_split needs backward computation. -I0906 13:59:13.975533 16537 net.cpp:193] fc8 needs backward computation. -I0906 13:59:13.975538 16537 net.cpp:193] relu7 needs backward computation. -I0906 13:59:13.975544 16537 net.cpp:193] fc7 needs backward computation. -I0906 13:59:13.975549 16537 net.cpp:193] relu6 needs backward computation. -I0906 13:59:13.975555 16537 net.cpp:193] fc6 needs backward computation. -I0906 13:59:13.975560 16537 net.cpp:193] pool5 needs backward computation. -I0906 13:59:13.975566 16537 net.cpp:193] relu5 needs backward computation. -I0906 13:59:13.975572 16537 net.cpp:193] conv5 needs backward computation. -I0906 13:59:13.975577 16537 net.cpp:193] relu4 needs backward computation. 
-I0906 13:59:13.975582 16537 net.cpp:193] conv4 needs backward computation. -I0906 13:59:13.975589 16537 net.cpp:193] relu3 needs backward computation. -I0906 13:59:13.975594 16537 net.cpp:193] conv3 needs backward computation. -I0906 13:59:13.975600 16537 net.cpp:193] pool2 needs backward computation. -I0906 13:59:13.975605 16537 net.cpp:193] norm2 needs backward computation. -I0906 13:59:13.975611 16537 net.cpp:193] relu2 needs backward computation. -I0906 13:59:13.975616 16537 net.cpp:193] conv2 needs backward computation. -I0906 13:59:13.975622 16537 net.cpp:193] pool1 needs backward computation. -I0906 13:59:13.975628 16537 net.cpp:193] norm1 needs backward computation. -I0906 13:59:13.975635 16537 net.cpp:193] relu1 needs backward computation. -I0906 13:59:13.975639 16537 net.cpp:193] conv1 needs backward computation. -I0906 13:59:13.975646 16537 net.cpp:195] label_data_1_split does not need backward computation. -I0906 13:59:13.975654 16537 net.cpp:195] data does not need backward computation. -I0906 13:59:13.975658 16537 net.cpp:236] This network produces output accuracy -I0906 13:59:13.975664 16537 net.cpp:236] This network produces output loss -I0906 13:59:13.975702 16537 net.cpp:483] Collecting Learning Rate and Weight Decay. -I0906 13:59:13.975714 16537 net.cpp:248] Network initialization done. -I0906 13:59:13.975718 16537 net.cpp:249] Memory required for data: 414724408 -I0906 13:59:13.975903 16537 solver.cpp:53] Solver scaffolding done. 
-I0906 13:59:13.976030 16537 solver.cpp:270] Solving AlexNet -I0906 13:59:13.976050 16537 solver.cpp:271] Learning Rate Policy: step -I0906 13:59:13.977635 16537 solver.cpp:314] Iteration 0, Testing net (#0) -I0906 13:59:13.977653 16537 net.cpp:696] Copying source layer data -I0906 13:59:13.977660 16537 net.cpp:696] Copying source layer conv1 -I0906 13:59:13.980556 16537 net.cpp:696] Copying source layer relu1 -I0906 13:59:13.980595 16537 net.cpp:696] Copying source layer norm1 -I0906 13:59:13.980607 16537 net.cpp:696] Copying source layer pool1 -I0906 13:59:13.980617 16537 net.cpp:696] Copying source layer conv2 -I0906 13:59:13.980785 16537 net.cpp:696] Copying source layer relu2 -I0906 13:59:13.980798 16537 net.cpp:696] Copying source layer norm2 -I0906 13:59:13.980808 16537 net.cpp:696] Copying source layer pool2 -I0906 13:59:13.980818 16537 net.cpp:696] Copying source layer conv3 -I0906 13:59:13.981422 16537 net.cpp:696] Copying source layer relu3 -I0906 13:59:13.981437 16537 net.cpp:696] Copying source layer conv4 -I0906 13:59:13.982098 16537 net.cpp:696] Copying source layer relu4 -I0906 13:59:13.982115 16537 net.cpp:696] Copying source layer conv5 -I0906 13:59:13.982612 16537 net.cpp:696] Copying source layer relu5 -I0906 13:59:13.982626 16537 net.cpp:696] Copying source layer pool5 -I0906 13:59:13.982636 16537 net.cpp:696] Copying source layer fc6 -I0906 13:59:13.993058 16537 net.cpp:696] Copying source layer relu6 -I0906 13:59:13.993091 16537 net.cpp:696] Copying source layer fc7 -I0906 13:59:13.997967 16537 net.cpp:696] Copying source layer relu7 -I0906 13:59:13.997984 16537 net.cpp:696] Copying source layer fc8 -I0906 13:59:13.998755 16537 net.cpp:696] Copying source layer loss -I0906 13:59:13.998867 16537 base_data_layer.cpp:89] Thread joined -I0906 13:59:14.003283 16537 base_data_layer.cpp:93] Prefetch copied -I0906 13:59:14.003650 16537 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:59:14.096194 16546 data_layer.cpp:120] Prefetch batch: 92 
ms. -I0906 13:59:14.096225 16546 data_layer.cpp:121] Read time: 12.131 ms. -I0906 13:59:14.096233 16546 data_layer.cpp:122] Transform time: 79.106 ms. -I0906 13:59:17.032117 16537 solver.cpp:363] Test net output #0: accuracy = 0 -I0906 13:59:17.032146 16537 solver.cpp:363] Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss) -I0906 13:59:17.032196 16537 base_data_layer.cpp:89] Thread joined -I0906 13:59:17.041095 16537 base_data_layer.cpp:93] Prefetch copied -I0906 13:59:17.041471 16537 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:59:17.232076 16547 data_layer.cpp:120] Prefetch batch: 190 ms. -I0906 13:59:17.232108 16547 data_layer.cpp:121] Read time: 24.399 ms. -I0906 13:59:17.232116 16547 data_layer.cpp:122] Transform time: 164.272 ms. -I0906 13:59:23.802855 16537 solver.cpp:234] Iteration 0, loss = 0 -I0906 13:59:23.802914 16537 solver.cpp:249] Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss) -I0906 13:59:23.802963 16537 solver.cpp:506] Iteration 0, lr = 0.01 -I0906 13:59:23.918314 16537 base_data_layer.cpp:89] Thread joined -I0906 13:59:23.926301 16537 base_data_layer.cpp:93] Prefetch copied -I0906 13:59:23.926447 16537 base_data_layer.cpp:104] CreatePrefetchThread -I0906 13:59:24.110566 16549 data_layer.cpp:120] Prefetch batch: 183 ms. -I0906 13:59:24.110599 16549 data_layer.cpp:121] Read time: 23.839 ms. -I0906 13:59:24.110605 16549 data_layer.cpp:122] Transform time: 158.415 ms. 
-I0906 13:59:26.694295 16537 solver.cpp:234] Iteration 1, loss = 0 diff --git a/log/caffe.INFO b/log/caffe.INFO deleted file mode 120000 index 65520a80..00000000 --- a/log/caffe.INFO +++ /dev/null @@ -1 +0,0 @@ -caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 \ No newline at end of file From 5698e3c53d3f9e9010a9375174d1fe69c5b583cc Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 6 Sep 2015 16:59:36 +0800 Subject: [PATCH 059/124] Ported hdf5_data hdf5_output log and mvn layer --- include/caffe/util/ocl_wrapper.hpp | 21 ++++ src/caffe/layers/hdf5_data_layer.cpp | 27 +++++ src/caffe/layers/hdf5_output_layer.cpp | 16 +++ src/caffe/layers/log_layer.cpp | 35 +++++++ src/caffe/layers/mvn_layer.cpp | 104 +++++++++++++++++++ src/caffe/layers/relu_layer.cl | 22 ---- src/caffe/ocl/util.cl | 77 ++++++++++++++ src/caffe/util/math_functions.cpp | 74 +++++++++----- src/caffe/util/ocl_wrapper.cpp | 133 +++++++++++++++++++++++++ 9 files changed, 464 insertions(+), 45 deletions(-) delete mode 100644 src/caffe/layers/relu_layer.cl diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index de188e11..90d22752 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -146,6 +146,27 @@ void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_max, Dtype* data); +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out); + +template +void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out); + +template +void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out); + +template +void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out); + +template +void kernel_log(const int count, const Dtype* data, Dtype* out); + +template +void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out); + +template +void kernel_add_scalar(const int count, const 
Dtype data, Dtype* out); + template void kernel_exp(const int count, const Dtype* data, Dtype* out); diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 649dc020..dda29aee 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -160,6 +160,33 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + current_file_ += 1; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + caffe_copy(data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); + } + } } diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 7d1ca097..bd608e86 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -70,6 +70,22 @@ void HDF5OutputLayer::Backward_cpu(const vector*>& top, template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), 
bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + + for (int i = 0; i < bottom[0]->num(); ++i) { + caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], + &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], + &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + } + SaveBlobs(); } template diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 9d3977a7..461fd9bf 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -80,11 +80,46 @@ void LogLayer::Backward_cpu(const vector*>& top, template void LogLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_gpu_log(count, bottom_data, top_data); + } else { + caffe_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); + } + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); + } } template void LogLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (!propagate_down[0]) { return; } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + 
caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 84701831..cbeeb150 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -137,11 +137,115 @@ void MVNLayer::Backward_cpu(const vector*>& top, template void MVNLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), + sum_multiplier_.gpu_data(), 0., + variance_.mutable_gpu_data()); // E(X^2) + caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), + temp_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), + variance_.mutable_gpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + } else { + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + + // subtract mean + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + } } template void MVNLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., + bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); + } else { + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, top_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + } } diff --git a/src/caffe/layers/relu_layer.cl b/src/caffe/layers/relu_layer.cl deleted file mode 100644 index cebe24cd..00000000 --- a/src/caffe/layers/relu_layer.cl +++ /dev/null @@ -1,22 +0,0 @@ -template -__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ - int index = get_global_id(0); - if(index < count) - out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; -} - -//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); -template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); - -template -__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ - int index = get_global_id(0); - if(index < count) - out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; -} - -template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); -template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); - - diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index cda05652..7c907058 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -90,6 +90,62 @@ __kernel void exp (const int num, __global T* data, __global T* out){ template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); +template +__kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] - b[index]; + } +} + +template __attribute__ 
((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_sub(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] + b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_add(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] / b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_div(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] * b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, 
__global double* out); + + +template +__kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = pow(data[index], alpha); + } +} + +template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out); +template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out); + template __kernel void kernel_exp(const int count, __global const T* data, __global T* out) { @@ -102,6 +158,27 @@ __kernel void kernel_exp(const int count, __global const T* data, __global T* ou template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); +template +__kernel void kernel_add_scalar(const int count, const T data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = out[index] + data; + } +} + +template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); +template __attribute__ ((mangled_name(kernel_add_scalar__double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out); + +template +__kernel void kernel_log(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = log(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_log_double))) 
__kernel void kernel_log(const int count, __global const double* data, __global double* out); template __kernel void diff (const int num, const int dim, __global T* data, __global T* label){ diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 6b2276ca..34442442 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -653,31 +653,24 @@ void caffe_gpu_set(const int N, const double alpha, double* Y) { } } -template -void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { -} - template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { + kernel_add_scalar(N, alpha, Y); } template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { -} - -template -void mul_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { + kernel_add_scalar(N, alpha, Y); } template <> void caffe_gpu_exp(const int N, const float* a, float* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } template <> void caffe_gpu_exp(const int N, const double* a, double* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } template<> @@ -690,9 +683,24 @@ void caffe_gpu_sign(const int N, const double *X, double *Y){ caffe_gpu_sign_ocl(N, X, Y); } +template <> +void caffe_gpu_sub(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); +} + +template <> +void caffe_gpu_sub(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); +} + template <> void caffe_gpu_mul(const int N, const float* a, const float* b, float* y) { + kernel_mul(N, a, b, y); } template <> @@ -700,26 +708,31 @@ void caffe_gpu_mul(const int N, const double* a, const double* b, double* y) { } -template -void div_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { -} - template <> void caffe_gpu_div(const int N, const float* a, const float* b, float* 
y) { + kernel_div(N, a, b, y); } template <> void caffe_gpu_div(const int N, const double* a, const double* b, double* y) { + kernel_div(N, a, b, y); } -template -void powx_kernel(const int n, const Dtype* a, - const Dtype alpha, Dtype* y) { +template <> +void caffe_gpu_powx(const int N, const float* a, + const float alpha, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); } +template <> +void caffe_gpu_powx(const int N, const double* a, + const double alpha, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); +} void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { @@ -763,6 +776,23 @@ void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { } +template <> +void caffe_gpu_log(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + +template <> +void caffe_gpu_log(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + + + + + + template <> void caffe_log(const int n, const float* a, float* y) { vsLn(n, a, y); @@ -809,16 +839,14 @@ template <> void caffe_gpu_add(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - // add_kernel<<>>( - // N, a, b, y); + kernel_add(N, a, b, y); } template <> void caffe_gpu_add(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - // add_kernel<<>>( - // N, a, b, y); + kernel_add(N, a, b, y); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index f4b43acf..14caf874 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -205,6 +205,139 @@ template void kernel_channel_subtract( const int count, const int num, const int channels, const int spatial_dim, const double* channel_max, double* data); +template +void kernel_mul(const int 
count, const Dtype* a, const Dtype* b, Dtype* out) +{ + std::string kernel_name = "kernel_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_mul(const int count, const float* a, const float* b, float* out); +template void kernel_mul(const int count, const double* a, const double* b, double* out); + +template +void kernel_add_scalar(const int count, const Dtype data, Dtype* out) +{ + std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_add_scalar(const int count, const float data, float* out); +template void kernel_add_scalar(const int count, const double data, double* out); + + +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out) +{ + std::string kernel_name = "kernel_powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_powx(const int count, const float* data, const float alpha, float* out); +template void kernel_powx(const int count, const double* data, const double alpha, double* out); + +template +void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) +{ + std::string kernel_name = "kernel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_div(const int count, const float* a, const float* b, float* out); +template void kernel_div(const int count, const double* a, const double* b, double* out); + +template +void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) +{ + std::string kernel_name = "kernel_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), 
(void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_add(const int count, const float* a, const float* b, float* out); +template void kernel_add(const int count, const double* a, const double* b, double* out); + +template +void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) +{ + std::string kernel_name = "kernel_sub" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); + OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_sub(const int count, const float* a, const float* b, float* out); +template void kernel_sub(const int count, const double* a, const double* b, double* out); + +template +void kernel_log(const int count, const Dtype* data, Dtype* out) +{ + std::string kernel_name = "kernel_log" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); + OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); + OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + + size_t Global_Work_Size[1] = {(size_t)count}; + size_t Local_Work_Size[1] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); +} + +template void kernel_log(const int count, const 
float* data, float* out); +template void kernel_log(const int count, const double* data, double* out); + + template void kernel_exp(const int count, const Dtype* data, Dtype* out) { From 0ccf6587350ad1084730643a3b322040f92698cd Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 6 Sep 2015 19:54:30 +0800 Subject: [PATCH 060/124] Port absval_layer bnll_layer concat_layer contrastive_loss_layer deconv_layer eltwise_layer euclidean_loss_layer exp_layer & filter_layer --- include/caffe/common.hpp | 2 +- include/caffe/util/ocl_wrapper.hpp | 25 ++++ src/caffe/layers/absval_layer.cpp | 11 ++ src/caffe/layers/bnll_layer.cpp | 18 ++- src/caffe/layers/concat_layer.cpp | 32 +++- src/caffe/layers/contrastive_loss_layer.cpp | 62 +++++++- src/caffe/layers/deconv_layer.cpp | 46 +++++- src/caffe/layers/eltwise_layer.cpp | 78 +++++++++- src/caffe/layers/euclidean_loss_layer.cpp | 28 +++- src/caffe/layers/filter_layer.cpp | 54 ++++++- src/caffe/ocl/bnll_layer.cl | 26 ++++ src/caffe/ocl/concat_layer.cl | 28 ++++ src/caffe/ocl/contrastive_loss_layer.cl | 38 +++++ src/caffe/ocl/eltwise_layer.cl | 47 ++++++ src/caffe/util/ocl_wrapper.cpp | 155 ++++++++++++++++++++ 15 files changed, 629 insertions(+), 21 deletions(-) create mode 100644 src/caffe/ocl/bnll_layer.cl create mode 100644 src/caffe/ocl/concat_layer.cl create mode 100644 src/caffe/ocl/contrastive_loss_layer.cl create mode 100644 src/caffe/ocl/eltwise_layer.cl diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 8c738ca3..c5bf909d 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -81,7 +81,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 0 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 90d22752..d644d16a 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -223,6 +223,31 @@ void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y); template void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data); + +template +void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff); + +template +void MaxForward(const int nthreads, const Dtype* bottom_data_a, + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask); + +template +void MaxBackward(const int nthreads, const Dtype* top_diff, + const int blob_idx, const int* mask, Dtype* bottom_diff); } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 30422737..12776eb8 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -38,11 +38,22 @@ void AbsValLayer::Backward_cpu(const vector*>& top, template void AbsValLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); } template void AbsValLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const 
vector*>& bottom){ + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sign(count, bottom_data, bottom_diff); + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 09e2bc89..3fe6f42e 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -40,12 +40,26 @@ void BNLLLayer::Backward_cpu(const vector*>& top, template void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLForward(count, bottom_data, top_data); } template void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLBackward(count, top_diff, bottom_data, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 6af287a9..d1d0e927 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -90,14 +90,42 @@ void ConcatLayer::Backward_cpu(const vector*>& top, template void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = 
top[0]->shape(concat_axis_); + const bool kForward = true; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + offset_concat_axis += bottom_concat_axis; + } } template void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = false; + for (int i = 0; i < bottom.size(); ++i) { + if (!propagate_down[i]) { continue; } + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + offset_concat_axis += bottom_concat_axis; + } } + #ifdef CPU_ONLY STUB_GPU(ConcatLayer); #endif diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index aad4cab3..4b47eb42 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -112,13 +112,69 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } template -void ContrastiveLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ +void ContrastiveLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + const int count = 
bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx( + count, + diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), + diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv( + CblasNoTrans, + bottom[0]->num(), + bottom[0]->channels(), + Dtype(1.0), + diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), + Dtype(0.0), + dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist*dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const int count = bottom[0]->count(); + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(bottom[0]->num()); + // NOLINT_NEXT_LINE(whitespace/operators) + CLLBackward(count, channels, margin, legacy_version, alpha, + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index e8937238..ad9a690e 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -71,16 +71,56 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data + top[i]->offset(n), bias); + } + } + } } template void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); + } + } + } + } } - #ifdef CPU_ONLY STUB_GPU(DeconvolutionLayer); #endif diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index cffc743d..61417f8c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -153,16 +153,88 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, template void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + int* mask = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.mutable_gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); + for (int i = 2; i < bottom.size(); ++i) { + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } } template void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { continue; } + if (!initialized) { + caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); + initialized = true; + } else { + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + } + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1.)) { + caffe_copy(count, top_diff, bottom_diff); + } else { + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.gpu_data(); + MaxBackward(count, top_diff, i, mask, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown elementwise 
operation."; + } + } + } } - #ifdef CPU_ONLY STUB_GPU(EltwiseLayer); #endif diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 9c37c18b..d1efe5bb 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -49,16 +49,36 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), + bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + Dtype dot; + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b + } + } } - - #ifdef CPU_ONLY STUB_GPU(EuclideanLossLayer); #endif diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index 4d004ad4..c5f5e4dd 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -119,15 +119,63 @@ void FilterLayer::Backward_cpu(const vector*>& top, template void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->gpu_data(); + Dtype* top_data = top[t]->mutable_gpu_data(); + int dim = bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * dim; + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } } template void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; i < top.size(); ++i) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); ++n) { + if 
(next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not been forwarded + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } + } + } + } + } } - #ifdef CPU_ONLY STUB_GPU(FilterLayer); #endif diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl new file mode 100644 index 00000000..c297db75 --- /dev/null +++ b/src/caffe/ocl/bnll_layer.cl @@ -0,0 +1,26 @@ +#define kBNLL_THRESHOLD 50.0 + +template +__kernel void BNLLForward(const int n, __global const T* in, __global T* out) { + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. 
+ exp(in[index])); + } +} +template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out); +template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out); + +template +__kernel void BNLLBackward(const int n, __global const T* in_diff, + __global const T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if (index < n) { + T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); + out_diff[index] = in_diff[index] * expval / (expval + 1.); + } +} + +template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, + __global const float* in_data, __global float* out_diff); +template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff, + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl new file mode 100644 index 00000000..dfcbfbc5 --- /dev/null +++ b/src/caffe/ocl/concat_layer.cl @@ -0,0 +1,28 @@ +template +__kernel void Concat(const int nthreads, __global const T* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } +} + +template __attribute__((mangled_name(Concat_float))) __kernel void 
Concat(const int nthreads, __global const float* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl new file mode 100644 index 00000000..5a67e399 --- /dev/null +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -0,0 +1,38 @@ +template +__kernel void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { + int i = get_global_id(0); + if(i < count) { + int n = i / channels; // the num index, to access y and dist_sq + if (static_cast(y[n])) { // similar pairs + bottom_diff[i] = alpha * diff[i]; + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; + } else { + bottom_diff[i] = 0; + } + } + } +} + +template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); +template __attribute__((mangled_name(CLLBackward_double))) __kernel void 
CLLBackward(const int count, const int channels, + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl new file mode 100644 index 00000000..1be9cb43 --- /dev/null +++ b/src/caffe/ocl/eltwise_layer.cl @@ -0,0 +1,47 @@ +template +__kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype maxval = -FLT_MAX; + int maxidx = -1; + if (bottom_data_a[index] > bottom_data_b[index]) { + // only update for very first bottom_data blob (blob_idx == 0) + if (blob_idx == 0) { + maxval = bottom_data_a[index]; + top_data[index] = maxval; + maxidx = blob_idx; + mask[index] = maxidx; + } + } else { + maxval = bottom_data_b[index]; + top_data[index] = maxval; + maxidx = blob_idx + 1; + mask[index] = maxidx; + } + } +} +template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); +template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); + +template +__kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype gradient = 0; + if (mask[index] == blob_idx) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } +} +template 
__attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, + const int blob_idx, __global const int* mask, __global float* bottom_diff); +template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 14caf874..f2897538 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1209,6 +1209,160 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, template void DropoutBackward(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) +{ + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLForward(const int count, const float* bottom_data, float *top_data); +template void BNLLForward(const int count, const double* bottom_data, double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff) +{ + 
std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(kernel,3,sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLBackward(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff); + + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data) +{ + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*)&forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&num_concats); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&concat_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)nthreads}; + size_t 
Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Concat(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data); +template void Concat(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data); + +template +void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff) +{ + std::string kernel_name = "CLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&channels); + ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*)&margin); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*)&legacy_version); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&y); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&diff); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&dist_sq); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void CLLBackward(const int count, const int channels, + const float margin, const bool legacy_version, const float alpha, + const float* y, const 
float* diff, const float* dist_sq, + float *bottom_diff); +template void CLLBackward(const int count, const int channels, + const double margin, const bool legacy_version, const double alpha, + const double* y, const double* diff, const double* dist_sq, + double *bottom_diff); + +template +void MaxForward(const int nthreads, const Dtype* bottom_data_a, + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask) +{ + std::string kernel_name = "MaxForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&bottom_data_a); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&bottom_data_b); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&blob_idx); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)nthreads}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void MaxForward(const int nthreads, const float* bottom_data_a, + const float* bottom_data_b, const int blob_idx, float* top_data, + int* mask); +template void MaxForward(const int nthreads, const double* bottom_data_a, + const double* bottom_data_b, const int blob_idx, double* top_data, + int* mask); + +template +void MaxBackward(const int nthreads, const Dtype* top_diff, + const int blob_idx, const int* mask, Dtype* bottom_diff) +{ + std::string kernel_name = "MaxBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel, 2, 
sizeof(cl_int), (void*)&blob_idx); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)nthreads}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void MaxBackward(const int nthreads, const float* top_diff, const int blob_idx, const int* mask, float* bottom_diff); +template void MaxBackward(const int nthreads, const double* top_diff, const int blob_idx, const int* mask, double* bottom_diff); + + template void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) { @@ -1216,4 +1370,5 @@ void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, template void ocl_conv(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); + } // namespace caffe From dfc6cb133c929a173d5d34a1c367bb3e4af136bf Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 6 Sep 2015 22:31:19 +0800 Subject: [PATCH 061/124] Fix some bugs in layers' porting --- src/caffe/common.cpp | 2 ++ src/caffe/layers/deconv_layer.cpp | 11 +++++--- src/caffe/layers/im2col_layer.cpp | 20 ++++++++++++-- src/caffe/layers/softmax_loss_layer.cpp | 2 +- src/caffe/ocl/bnll_layer.cl | 26 +++++++++++++++++++ src/caffe/ocl/concat_layer.cl | 26 +++++++++++++++++++ 
src/caffe/ocl/contrastive_loss_layer.cl | 26 +++++++++++++++++++ src/caffe/ocl/eltwise_layer.cl | 26 +++++++++++++++++++ src/caffe/ocl/util.cl | 2 +- .../test/test_data/generate_sample_data.py | 0 src/caffe/util/math_functions.cpp | 18 +++++++++++++ src/caffe/util/ocl_wrapper.cpp | 24 ++++++++--------- 12 files changed, 164 insertions(+), 19 deletions(-) mode change 100644 => 100755 src/caffe/test/test_data/generate_sample_data.py diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index e12c48c9..c1d26ab8 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -104,6 +104,8 @@ Caffe::~Caffe() { } void Caffe::set_random_seed(const unsigned int seed) { + // RNG seed + Get().random_generator_.reset(new RNG(seed)); } void Caffe::SetDevice(const int device_id) { diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index ad9a690e..4b952c73 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -77,11 +77,12 @@ void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_gemm(bottom_data, weight, top_data); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); + this->forward_gpu_bias(top_data, bias); } } } @@ -100,11 +101,15 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); this->backward_gpu_bias(bias_diff, top_diff + 
top[i]->offset(n)); } } if (this->param_propagate_down_[0] || propagate_down[i]) { for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm(top_diff + top[i]->offset(n), diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index ddf6c989..7b667172 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -87,14 +87,30 @@ void Im2colLayer::Backward_cpu(const vector*>& top, template void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data, top[0]->offset(n)); + } } template void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); + } } + + #ifdef CPU_ONLY STUB_GPU(Im2colLayer); #endif diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 22456302..d8db1797 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -152,7 +152,6 @@ void SoftmaxWithLossLayer::Forward_gpu( outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); Dtype loss; caffe_gpu_asum(nthreads, 
loss_data, &loss); - printf("loss = %f\n", loss); if (normalize_) { Dtype count; caffe_gpu_asum(nthreads, counts, &count); @@ -160,6 +159,7 @@ void SoftmaxWithLossLayer::Forward_gpu( } else { loss /= outer_num_; } + printf("loss = %f\n", loss); top[0]->mutable_cpu_data()[0] = loss; if (top.size() == 2) { top[1]->ShareData(prob_); diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl index c297db75..03ddba8a 100644 --- a/src/caffe/ocl/bnll_layer.cl +++ b/src/caffe/ocl/bnll_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #define kBNLL_THRESHOLD 50.0 template diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl index dfcbfbc5..71eb8c77 100644 --- a/src/caffe/ocl/concat_layer.cl +++ b/src/caffe/ocl/concat_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void Concat(const int nthreads, __global const T* in_data, const bool forward, const int num_concats, const int concat_size, diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl index 5a67e399..8ed18ce4 100644 --- a/src/caffe/ocl/contrastive_loss_layer.cl +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void CLLBackward(const int count, const int channels, const Dtype margin, const bool legacy_version, const Dtype alpha, diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl index 1be9cb43..d843884a 100644 --- a/src/caffe/ocl/eltwise_layer.cl +++ b/src/caffe/ocl/eltwise_layer.cl @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + template __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index 7c907058..d15f168c 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -167,7 +167,7 @@ __kernel void kernel_add_scalar(const int count, const T data, __global T* out) } template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); -template __attribute__ ((mangled_name(kernel_add_scalar__double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out); +template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out); template __kernel void kernel_log(const int count, __global const T* data, __global T* out) { diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py old mode 100644 new mode 100755 diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 34442442..787f2b16 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -544,6 +544,12 @@ double 
caffe_cpu_dot(const int n, const double* x, const double* y) { template <> void caffe_gpu_dot(const int n, const float* x, const float* y, float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL); + clblasSdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); /* single-precision dot: buffers above are sized for float */ + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); } template <> @@ -551,6 +557,12 @@ void caffe_gpu_dot(const int n, const double* x, const double* y, double * out) { //need to pass in scratchBuff //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(double)), NULL, NULL); + clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), out,0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); } template <> @@ -597,6 +609,12 @@ void caffe_gpu_asum(const int n, const float* x, float* y) { template <> void caffe_gpu_asum(const int n, const double* x, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_double)), NULL, NULL); + clblasDasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), y,0, NULL, NULL); + clReleaseMemObject(scratchBuff); + 
clReleaseMemObject(d_y); } //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index f2897538..b479ddff 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -966,8 +966,8 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height); ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width); ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LFSkernel,7,sizeof(cl_float),(void*)&alpha_over_size); - ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k); + ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size); + ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k); ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); OCL_CHECK(ret); size_t uiGlobal_Work_Size[]={(size_t)nthreads}; @@ -990,7 +990,7 @@ void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale); - ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta); + ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta); ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); OCL_CHECK(ret); size_t uiGlobal_Work_Size2[]={(size_t)nthreads}; @@ -1020,8 +1020,8 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height); ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width); ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LCDkernel,10,sizeof(cl_float),(void*)&negative_beta); - ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio); + 
ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta); + ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio); ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); OCL_CHECK(ret); size_t uiGlobal_Work_Size[]={(size_t)nthreads}; @@ -1117,7 +1117,7 @@ void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); OCL_CHECK(ret); size_t Global_Work_Size[] = {(size_t)n}; @@ -1175,7 +1175,7 @@ void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMe ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); - ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); + ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_); ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); OCL_CHECK(ret); @@ -1198,7 +1198,7 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); - ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); + ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_); ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); OCL_CHECK(ret); @@ -1263,10 +1263,10 @@ void Concat(const int nthreads, const Dtype* in_data, const bool forward, const ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*)&forward); ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), 
(void*)&num_concats); ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&concat_size); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&top_concat_axis); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bottom_concat_axis); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&offset_concat_axis); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&out_data); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&out_data); OCL_CHECK(ret); size_t Global_Work_Size[] = {(size_t)nthreads}; From 84d80c2e5a2244d2dcef0583fdb52e76f772bdb4 Mon Sep 17 00:00:00 2001 From: Noplz Date: Mon, 7 Sep 2015 17:57:28 +0800 Subject: [PATCH 062/124] Ignore log dir --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 28f2aca8..434c7112 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,6 @@ LOCK LOG* CURRENT MANIFEST-* + +#log files +log From 097f69cfb98bb8bada7a52014b706bf4ad5f606e Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 8 Sep 2015 14:37:15 +0800 Subject: [PATCH 063/124] ported new layers --- include/caffe/util/ocl_wrapper.hpp | 14 +++ src/caffe/layers/reduction_layer.cpp | 71 +++++++++++++++ src/caffe/layers/relu_layer.cpp | 4 +- .../sigmoid_cross_entropy_loss_layer.cpp | 17 ++++ src/caffe/layers/sigmoid_layer.cpp | 14 +++ src/caffe/layers/silence_layer.cpp | 6 ++ src/caffe/layers/tanh_layer.cpp | 14 +++ src/caffe/layers/threshold_layer.cpp | 6 ++ src/caffe/ocl/sigmoid_layer.cl | 46 ++++++++++ src/caffe/ocl/tanh_layer.cl | 46 ++++++++++ src/caffe/ocl/threshold_layer.cl | 36 ++++++++ src/caffe/util/math_functions.cpp | 9 +- src/caffe/util/ocl_wrapper.cpp | 90 +++++++++++++++++++ 13 files changed, 365 insertions(+), 8 deletions(-) create mode 100644 
src/caffe/ocl/sigmoid_layer.cl create mode 100644 src/caffe/ocl/tanh_layer.cl create mode 100644 src/caffe/ocl/threshold_layer.cl diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index d644d16a..0d5f4b2e 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -84,6 +84,20 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int template void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff); +template +void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff); + +template +void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data); + +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff); + +template +void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data); diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index c4a8b4e0..4003ddd1 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -125,11 +125,82 @@ void ReductionLayer::Backward_cpu(const vector*>& top, template void ReductionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = 
bottom[0]->gpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); + } } template void ReductionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (!propagate_down[0]) { return; } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->gpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data, bottom_diff); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + bottom_diff += dim_; + ++top_diff; + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 784d2c91..c29d5baa 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -43,7 +43,7 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - ReLUForward(count,bottom_data,top_data,negative_slope); + ReLUForward(count,bottom_data,top_data,negative_slope); } @@ -57,7 +57,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int count = bottom[0]->count(); Dtype negative_slope = 
this->layer_param_.relu_param().negative_slope(); - ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope); + ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 1a4329da..1c22fe19 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -73,6 +73,23 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( template void SigmoidCrossEntropyLossLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 30ad9b0b..fa13a4c1 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -4,6 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -42,11 +43,24 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, template void SigmoidLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // 
NOLINT_NEXT_LINE(whitespace/operators) + SigmoidForward(count, bottom_data, top_data); } template void SigmoidLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidBackward(count, top_diff, top_data, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index ecd12d12..e36a5cad 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -26,6 +26,12 @@ void SilenceLayer::Forward_gpu(const vector*>& bottom, template void SilenceLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_diff()); /* backward zeroes the gradient, not the activations */ + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index abc09bbc..a922adbd 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -6,6 +6,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -40,11 +41,24 @@ void TanHLayer::Backward_cpu(const vector*>& top, template void TanHLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHForward(count, bottom_data, top_data); } template void TanHLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom){ + if (propagate_down[0]) { + const Dtype* top_data = 
top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHBackward(count, top_diff, top_data, bottom_diff); +} } diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 345fd6b7..b3e1bea7 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -2,6 +2,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -27,6 +28,11 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, template void ThresholdLayer::Forward_gpu(const vector*>& bottom, const vector*>& top){ + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ThresholdForward(count, threshold_, bottom_data, top_data); } #ifdef CPU_ONLY diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl new file mode 100644 index 00000000..eb952e6f --- /dev/null +++ b/src/caffe/ocl/sigmoid_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void SigmoidForward(const int count, __global T* in, __global T* out){ + int index = get_global_id(0); + if(index < count) + out[index] = 1. / (1. 
+ exp(-in[index])); +} + +template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out); + +template +__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){ + int index = get_global_id(0); + if(index >= count) return; /* bounds-check before any global-memory read */ + const T sigmoid_x = out_data[index]; + out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); +} + +template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(SigmoidBackward_double))) __kernel void SigmoidBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff); diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl new file mode 100644 index 00000000..2f0a08c6 --- /dev/null +++ b/src/caffe/ocl/tanh_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void TanHForward(const int count, __global T* in, __global T* out){ + int index = get_global_id(0); + if(index < count) + out[index] =tanh(in[index]); +} + +template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out); + +template +__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){ + int index = get_global_id(0); + if(index >= count) return; /* bounds-check before any global-memory read */ + const T tanhx = out_data[index]; + out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx); +} + +template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(TanHBackward_double))) __kernel void TanHBackward(const int count, __global double* in_diff, __global double* out_data, __global double* 
out_diff); diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl new file mode 100644 index 00000000..40d55f1c --- /dev/null +++ b/src/caffe/ocl/threshold_layer.cl @@ -0,0 +1,36 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out){ + int index = get_global_id(0); + if(index < count) + out[index] =in[index] > threshold ? 
1 : 0; +} + +template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out); +template __attribute__ ((mangled_name(ThresholdForward_double))) __kernel void ThresholdForward(const int count, const double threshold, __global double* in, __global double* out); + diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 787f2b16..fb531590 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -32,6 +32,7 @@ #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" +#include "caffe/util/ocl_util.hpp" static const clblasOrder order = clblasColumnMajor; #define pi 3.1415926 @@ -659,16 +660,12 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) { template <> void caffe_gpu_set(const int N, const float alpha, float* Y) { - if (alpha == 0) { - return; - } + ocl_memset(Y, alpha, N); } template <> void caffe_gpu_set(const int N, const double alpha, double* Y) { - if (alpha == 0) { - return; - } + ocl_memset(Y, alpha, N); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index b479ddff..c8110c00 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -923,6 +923,96 @@ void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_da template void ReLUBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); template void ReLUBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); +template +void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data){ + std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, 
sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SigmoidForward(const int count, const float* bottom_data, float* top_data); +template void SigmoidForward(const int count, const double* bottom_data, double* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){ + std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {(size_t)count}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} +template void SigmoidBackward(const int count, const float* top_diff, const float* top_data, float* bottom_diff); +template void SigmoidBackward(const int count, const double* top_diff, const double* top_data, double* bottom_diff); + +template +void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data){ + std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&threshold); 
+ ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ThresholdForward(const int count, const float threshold, const float* bottom_data, float* top_data); +template void ThresholdForward(const int count, const double threshold, const double* bottom_data, double* top_data); + +template +void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data){ + std::string kernel_name = "TanHForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void TanHForward(const int count, const float* bottom_data, float* top_data); +template void TanHForward(const int count, const double* bottom_data, double* top_data); + +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){ + std::string kernel_name = "TanHBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); + 
OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {(size_t)count}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} +template void TanHBackward(const int count, const float* top_diff, const float* top_data, float* bottom_diff); +template void TanHBackward(const int count, const double* top_diff, const double* top_data, double* bottom_diff); + template void opttrans(const Dtype* data_im, const int im_offset, const int channels, const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { From 8266a0afc5f98393117d9977c4941facdd58b8a3 Mon Sep 17 00:00:00 2001 From: Yibing Date: Tue, 8 Sep 2015 20:45:55 +0800 Subject: [PATCH 064/124] Made my own last porting layers go through unit test --- include/caffe/util/math_functions.hpp | 2 +- src/caffe/layers/concat_layer.cpp | 16 +++++++++------- src/caffe/layers/eltwise_layer.cpp | 4 ++-- src/caffe/layers/log_layer.cpp | 4 ++-- src/caffe/ocl/relu_layer.cl | 5 +++-- src/caffe/util/math_functions.cpp | 26 ++++++++++++++++++++++---- 6 files changed, 39 insertions(+), 18 deletions(-) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 46949ff3..7f398153 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -122,7 +122,7 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) { inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { #ifndef CPU_ONLY - ocl_memset((int*)X, alpha, N); + ocl_memset((int*)X, (alpha<<24)|(alpha<<16)|(alpha<<8)|alpha, N); #else NO_GPU; #endif diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index d1d0e927..6bc8f9e9 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -91,6 +91,7 @@ void ConcatLayer::Backward_cpu(const vector*>& top, template void 
ConcatLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { + if (bottom.size() == 1) { return; } Dtype* top_data = top[0]->mutable_gpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -109,23 +110,24 @@ void ConcatLayer::Forward_gpu(const vector*>& bottom, template void ConcatLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { + if (bottom.size() == 1) { return; } const Dtype* top_diff = top[0]->gpu_diff(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); const bool kForward = false; for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + if (propagate_down[i]) { + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + } offset_concat_axis += bottom_concat_axis; } } - #ifdef CPU_ONLY STUB_GPU(ConcatLayer); #endif diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 61417f8c..5a7e5e74 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -204,7 +204,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, for (int j = 0; j < bottom.size(); ++j) { if (i == j) { continue; } if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); + 
caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); initialized = true; } else { caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, @@ -218,7 +218,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, break; case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); + caffe_gpu_copy(count, top_diff, bottom_diff); } else { caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); } diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 461fd9bf..268c5f5b 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -86,7 +86,7 @@ void LogLayer::Forward_gpu(const vector*>& bottom, if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { caffe_gpu_log(count, bottom_data, top_data); } else { - caffe_copy(count, bottom_data, top_data); + caffe_gpu_copy(count, bottom_data, top_data); if (input_scale_ != Dtype(1)) { caffe_gpu_scal(count, input_scale_, top_data); } @@ -108,7 +108,7 @@ void LogLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); + caffe_gpu_copy(count, bottom_data, bottom_diff); if (input_scale_ != Dtype(1)) { caffe_gpu_scal(count, input_scale_, bottom_diff); } diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index d3b36a34..b7865838 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -37,8 +37,9 @@ template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUFo template __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ int index = get_global_id(0); - if(index < count) - out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope; + if(index < count) { + out_diff[index] = in_diff[index] 
* ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); + } } template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index fb531590..bb03b980 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -178,7 +178,7 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const double* x, size_t offx, const double beta, int incx, double* y, size_t offy, int incy) { clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } @@ -187,12 +187,20 @@ template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, + M, N, (cl_float)alpha, (cl_mem)A, 0, N, + (cl_mem)x, 0, 1, (cl_float)beta, + (cl_mem)y, 0, 1, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double* A, const double* x, const double beta, double* y) { + clblasTranspose transA = (TransA == CblasNoTrans)? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, 0, N, (cl_mem)x, 0, 1, (cl_double)beta, (cl_mem)y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); } template <> @@ -283,11 +291,20 @@ void caffe_copy(const int N, const double* X, double* Y) { cblas_dcopy(N, X, 1, Y, 1); } +//template void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { - OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)X, CL_TRUE, 0, N, Y,0, NULL, NULL); +// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); } +/* +template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); +template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); +template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); +template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); +template void caffe_gpu_memcpy(const size_t N, const float* X, float* Y); +template void caffe_gpu_memcpy(const size_t N, const double* X, double* Y); +*/ template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { @@ -547,7 +564,7 @@ void caffe_gpu_dot(const int n, const float* x, const float* y, float* out) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL); cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL); - clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); + clblasSdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL); clReleaseMemObject(scratchBuff); clReleaseMemObject(d_out); @@ 
-721,6 +738,7 @@ void caffe_gpu_mul(const int N, const float* a, template <> void caffe_gpu_mul(const int N, const double* a, const double* b, double* y) { + kernel_mul(N, a, b, y); } template <> From c37410b93408dc733d14102037fba2f342bb3c24 Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 8 Sep 2015 17:06:24 -0700 Subject: [PATCH 065/124] fix bug in PReLU layer --- include/caffe/common.hpp | 2 +- include/caffe/util/math_functions.hpp | 3 +++ include/caffe/util/ocl_wrapper.hpp | 2 +- src/caffe/layers/dropout_layer.cpp | 2 +- src/caffe/layers/prelu_layer.cpp | 6 +++--- src/caffe/layers/softmax_loss_layer.cpp | 4 ++-- src/caffe/ocl/prelu_layer.cl | 8 +++++--- src/caffe/util/math_functions.cpp | 12 ++++++++++-- src/caffe/util/ocl_wrapper.cpp | 12 +++++++----- 9 files changed, 33 insertions(+), 18 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index c5bf909d..b84672aa 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -84,7 +84,7 @@ private:\ #define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -#define global_packing_N 16 +#define global_packing_N 100 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 7f398153..b32760aa 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -130,6 +130,9 @@ inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); +template +void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); + template void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 0d5f4b2e..a15b68ff 100644 --- 
a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -125,7 +125,7 @@ template void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor); template -void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff); +void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data,const int offset_in, Dtype* bottom_diff); template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 75585a5f..4175a2b7 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -17,7 +17,7 @@ void DropoutLayer::ocl_setup(int bottom_count){ template DropoutLayer::~DropoutLayer(){ -// OCL_CHECK( clReleaseMemObject(MaskMem) ); + OCL_CHECK( clReleaseMemObject(MaskMem) ); } diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index ed51ac5e..426a0cad 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -141,7 +141,7 @@ void PReLULayer::Forward_gpu(const vector*>& bottom, const int div_factor = channel_shared_ ? 
channels : 1; if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); } PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor); } @@ -171,8 +171,8 @@ void PReLULayer::Backward_gpu(const vector*>& top, // compute element-wise diff // NOLINT_NEXT_LINE(whitespace/operators) PReLUParamBackward( - cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), + cdim, top_diff, top[0]->offset(n), + bottom_data, bottom[0]->offset(n), backward_buff_.mutable_gpu_diff()); if (channel_shared_) { Dtype d; diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index d8db1797..66ac9ea5 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -177,8 +177,8 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const Dtype* prob_data = prob_.gpu_data(); const Dtype* top_data = top[0]->gpu_data(); - // caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); const Dtype* label = bottom[1]->gpu_data(); const int dim = prob_.count() / outer_num_; const int nthreads = outer_num_ * inner_num_; diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index be85a2e4..6a45ea03 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -48,11 +48,13 @@ template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLU template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global 
double* slope_data, const int div_factor); template -__kernel void PReLUParamBackward(const int count, __global T* in_diff, __global T* in_data, __global T* out_diff) { +__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) { int index = get_global_id(0); if(index < count){ + in_diff += offset_out; + out_diff += offset_in; out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); } } -template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff); -template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index bb03b980..ed71edf6 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -33,6 +33,7 @@ #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" #include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" static const clblasOrder order = clblasColumnMajor; #define pi 3.1415926 @@ -302,9 +303,16 @@ template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); template 
void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); -template void caffe_gpu_memcpy(const size_t N, const float* X, float* Y); -template void caffe_gpu_memcpy(const size_t N, const double* X, double* Y); */ +template<> +void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) +{ OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); +} + +template<> +void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) +{ OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); +} template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index c8110c00..c8f28426 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -869,20 +869,22 @@ template void PReLUBackward(const int count, const int channels, const in template void PReLUBackward(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor); template -void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){ +void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data, const int offset_in, Dtype* bottom_diff){ std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); + ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&offset_out); + ret 
|= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data); + ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&offset_in); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff); size_t Global_Work_Size[] = {(size_t)count}; size_t Local_Work_Size[] = {256}; OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void PReLUParamBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff); -template void PReLUParamBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff); +template void PReLUParamBackward(const int count, const float* top_diff, const int offset_out, const float* bottom_data, const int offset_in, float* bottom_diff); +template void PReLUParamBackward(const int count, const double* top_diff, const int offset_out, const double* bottom_data, const int offset_in, double* bottom_diff); template From 454d6761dfd63f7f88648926659a581812daa8a6 Mon Sep 17 00:00:00 2001 From: Junli Date: Tue, 8 Sep 2015 20:57:53 -0700 Subject: [PATCH 066/124] modify conv layers --- src/caffe/layers/base_conv_layer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 26787393..f77507d9 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -292,10 +292,13 @@ template void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { cl_command_queue Queue; + const Dtype* col_buff = input; if (!is_1x1_) { if (!skip_im2col) { conv_im2col_gpu_opt(input); } + col_buff = col_buffer_.gpu_data(); + caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem); } #ifdef multiQ for (int g = 0; g < group_; ++g) { @@ -363,8 +366,7 @@ void 
BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, const Dtype* weights, Dtype* input) { cl_command_queue Queue; if (is_1x1_) { - int count = height_ * width_ * conv_in_channels_ * opt_num2; - caffe_gpu_copy(count, input, (Dtype*)transMem); + caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem); } for (int g = 0; g < group_; ++g) { #ifdef multiQ @@ -387,6 +389,8 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, if (!is_1x1_) { conv_col2im_gpu_opt(input); + }else{ + caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), (Dtype*)transMem, input); } } @@ -411,10 +415,11 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, cl_command_queue Queue; if (!is_1x1_) { conv_im2col_gpu_opt(input); - } + }else{ + caffe_gpu_memcpy( K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem); + } opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); - for (int g = 0; g < group_; ++g) { #ifdef multiQ if(g == 0) Queue = amdDevice.CommandQueue; From c8e5b9f6d9403890426ec07ced403582cb8e19ef Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 9 Sep 2015 13:33:57 +0800 Subject: [PATCH 067/124] Pass HDF5 layers unit test --- src/caffe/layers/hdf5_data_layer.cpp | 7 ++++--- src/caffe/layers/hdf5_output_layer.cpp | 6 ++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index dda29aee..af223c0f 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -182,9 +182,10 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); + OCL_CHECK( 
clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[j]->mutable_gpu_data(), CL_TRUE, i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], 0, NULL, NULL) ); + //caffe_copy(data_dim, + // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } } } diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index bd608e86..e2bd8e4c 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -80,10 +80,8 @@ void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[0]->gpu_data(), CL_TRUE, i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[1]->gpu_data(), CL_TRUE, i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL)); } SaveBlobs(); } From 8f700e8c2cd4791a99d772cbd5e2061e45d3796b Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 9 Sep 2015 15:09:52 +0800 Subject: [PATCH 068/124] minor fix --- include/caffe/common.hpp | 4 ++-- src/caffe/layers/base_conv_layer.cpp | 1 + src/caffe/util/math_functions.cpp | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index b84672aa..ac954a0e 100644 --- 
a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -81,10 +81,10 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ -#define global_packing_N 100 +#define global_packing_N 16 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index f77507d9..394fd9a5 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -298,6 +298,7 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, conv_im2col_gpu_opt(input); } col_buff = col_buffer_.gpu_data(); + }else{ caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem); } #ifdef multiQ diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index ed71edf6..80843191 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -684,12 +684,12 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) { } template <> -void caffe_gpu_set(const int N, const float alpha, float* Y) { +void caffe_gpu_set(const int N, const float alpha, float* Y) { ocl_memset(Y, alpha, N); } template <> -void caffe_gpu_set(const int N, const double alpha, double* Y) { +void caffe_gpu_set(const int N, const double alpha, double* Y) { ocl_memset(Y, alpha, N); } From 8166acfa8400fe3e48dff67a596e87b6029505c7 Mon Sep 17 00:00:00 2001 From: Noplz Date: Wed, 9 Sep 2015 15:49:41 +0800 Subject: [PATCH 069/124] Format the code --- include/caffe/blob.hpp | 490 +-- include/caffe/common.hpp | 141 +- include/caffe/common_layers.hpp | 1050 
+++--- include/caffe/data_layers.hpp | 574 +-- include/caffe/data_transformer.hpp | 258 +- include/caffe/device.hpp | 78 +- include/caffe/filler.hpp | 425 +-- include/caffe/internal_thread.hpp | 33 +- include/caffe/layer.hpp | 879 ++--- include/caffe/layer_factory.hpp | 109 +- include/caffe/loss_layers.hpp | 1074 +++--- include/caffe/net.hpp | 467 +-- include/caffe/neuron_layers.hpp | 1336 +++---- include/caffe/python_layer.hpp | 93 +- include/caffe/solver.hpp | 274 +- include/caffe/syncedmem.hpp | 94 +- include/caffe/test/test_caffe_main.hpp | 55 +- .../caffe/test/test_gradient_check_util.hpp | 436 +-- include/caffe/util/benchmark.hpp | 75 +- include/caffe/util/cudnn.hpp | 212 +- include/caffe/util/db.hpp | 65 +- include/caffe/util/db_leveldb.hpp | 117 +- include/caffe/util/db_lmdb.hpp | 141 +- include/caffe/util/im2col.hpp | 123 +- include/caffe/util/insert_splits.hpp | 8 +- include/caffe/util/io.hpp | 111 +- include/caffe/util/math_functions.hpp | 210 +- include/caffe/util/mkl_alternate.hpp | 16 +- include/caffe/util/ocl_util.hpp | 5 +- include/caffe/util/ocl_wrapper.hpp | 421 ++- include/caffe/util/rng.hpp | 33 +- include/caffe/util/upgrade_proto.hpp | 10 +- include/caffe/vision_layers.hpp | 942 ++--- src/caffe/blob.cpp | 754 ++-- src/caffe/common.cpp | 149 +- src/caffe/data_transformer.cpp | 931 +++-- src/caffe/device.cpp | 717 ++-- src/caffe/internal_thread.cpp | 41 +- src/caffe/layer_factory.cpp | 193 +- src/caffe/layers/absval_layer.cpp | 74 +- src/caffe/layers/accuracy_layer.cpp | 132 +- src/caffe/layers/argmax_layer.cpp | 84 +- src/caffe/layers/base_conv_layer.cpp | 749 ++-- src/caffe/layers/base_data_layer.cpp | 165 +- src/caffe/layers/bnll_layer.cpp | 89 +- src/caffe/layers/concat_layer.cpp | 220 +- src/caffe/layers/contrastive_loss_layer.cpp | 312 +- src/caffe/layers/conv_layer.cpp | 396 +-- src/caffe/layers/data_layer.cpp | 186 +- src/caffe/layers/deconv_layer.cpp | 209 +- src/caffe/layers/dropout_layer.cpp | 180 +- 
src/caffe/layers/dummy_data_layer.cpp | 196 +- src/caffe/layers/eltwise_layer.cpp | 429 +-- src/caffe/layers/euclidean_loss_layer.cpp | 118 +- src/caffe/layers/exp_layer.cpp | 144 +- src/caffe/layers/filter_layer.cpp | 302 +- src/caffe/layers/flatten_layer.cpp | 50 +- src/caffe/layers/hdf5_data_layer.cpp | 337 +- src/caffe/layers/hdf5_output_layer.cpp | 128 +- src/caffe/layers/hinge_loss_layer.cpp | 118 +- src/caffe/layers/im2col_layer.cpp | 174 +- src/caffe/layers/image_data_layer.cpp | 254 +- src/caffe/layers/infogain_loss_layer.cpp | 169 +- src/caffe/layers/inner_product_layer.cpp | 278 +- src/caffe/layers/log_layer.cpp | 210 +- src/caffe/layers/loss_layer.cpp | 26 +- src/caffe/layers/lrn_layer.cpp | 540 +-- src/caffe/layers/memory_data_layer.cpp | 182 +- .../multinomial_logistic_loss_layer.cpp | 88 +- src/caffe/layers/mvn_layer.cpp | 459 +-- src/caffe/layers/neuron_layer.cpp | 8 +- src/caffe/layers/pooling_layer.cpp | 751 ++-- src/caffe/layers/power_layer.cpp | 294 +- src/caffe/layers/prelu_layer.cpp | 356 +- src/caffe/layers/reduction_layer.cpp | 362 +- src/caffe/layers/relu_layer.cpp | 89 +- src/caffe/layers/reshape_layer.cpp | 161 +- .../sigmoid_cross_entropy_loss_layer.cpp | 145 +- src/caffe/layers/sigmoid_layer.cpp | 83 +- src/caffe/layers/silence_layer.cpp | 42 +- src/caffe/layers/slice_layer.cpp | 196 +- src/caffe/layers/softmax_layer.cpp | 249 +- src/caffe/layers/softmax_loss_layer.cpp | 339 +- src/caffe/layers/split_layer.cpp | 119 +- src/caffe/layers/spp_layer.cpp | 321 +- src/caffe/layers/tanh_layer.cpp | 82 +- src/caffe/layers/threshold_layer.cpp | 43 +- src/caffe/layers/window_data_layer.cpp | 783 ++-- src/caffe/net.cpp | 1624 ++++----- src/caffe/ocl/bnll_layer.cl | 24 +- src/caffe/ocl/concat_layer.cl | 48 +- src/caffe/ocl/contrastive_loss_layer.cl | 64 +- src/caffe/ocl/dropout_layer.cl | 21 +- src/caffe/ocl/eltwise_layer.cl | 72 +- src/caffe/ocl/im2col.cl | 441 ++- src/caffe/ocl/lrn_layer.cl | 194 +- src/caffe/ocl/pooling_layer.cl | 460 +-- 
src/caffe/ocl/prelu_layer.cl | 36 +- src/caffe/ocl/random.cl | 373 +- src/caffe/ocl/relu_layer.cl | 12 +- src/caffe/ocl/sigmoid_layer.cl | 12 +- src/caffe/ocl/softmax_layer.cl | 204 +- src/caffe/ocl/softmaxwithloss_layer.cl | 120 +- src/caffe/ocl/tanh_layer.cl | 12 +- src/caffe/ocl/threshold_layer.cl | 4 +- src/caffe/ocl/util.cl | 185 +- src/caffe/solver.cpp | 1373 ++++---- src/caffe/syncedmem.cpp | 201 +- src/caffe/util/benchmark.cpp | 143 +- src/caffe/util/cudnn.cpp | 28 +- src/caffe/util/db.cpp | 33 +- src/caffe/util/db_leveldb.cpp | 23 +- src/caffe/util/db_lmdb.cpp | 57 +- src/caffe/util/im2col.cpp | 612 ++-- src/caffe/util/im2col.cu | 201 +- src/caffe/util/insert_splits.cpp | 240 +- src/caffe/util/io.cpp | 412 +-- src/caffe/util/math_functions.cpp | 1082 +++--- src/caffe/util/math_functions.cu | 511 +-- src/caffe/util/ocl_util.cpp | 79 +- src/caffe/util/ocl_wrapper.cpp | 3133 ++++++++++------- src/caffe/util/upgrade_proto.cpp | 1740 ++++----- 122 files changed, 19862 insertions(+), 18473 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 12854689..e55ce8e6 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -21,262 +21,282 @@ namespace caffe { * * TODO(dox): more thorough description. */ -template +template class Blob { - public: - Blob() - : data_(), diff_(), count_(0), capacity_(0) {} + public: + Blob() + : data_(), diff_(), count_(0), capacity_(0) { + } - /// @brief Deprecated; use Blob(const vector& shape). - explicit Blob(const int num, const int channels, const int height, - const int width); - explicit Blob(const vector& shape); + /// @brief Deprecated; use Blob(const vector& shape). + explicit Blob(const int num, const int channels, const int height, + const int width); + explicit Blob(const vector& shape); - /// @brief Deprecated; use Reshape(const vector& shape). 
- void Reshape(const int num, const int channels, const int height, - const int width); - /** - * @brief Change the dimensions of the blob, allocating new memory if - * necessary. - * - * This function can be called both to create an initial allocation - * of memory, and to adjust the dimensions of a top blob during Layer::Reshape - * or Layer::Forward. When changing the size of blob, memory will only be - * reallocated if sufficient memory does not already exist, and excess memory - * will never be freed. - * - * Note that reshaping an input blob and immediately calling Net::Backward is - * an error; either Net::Forward or Net::Reshape need to be called to - * propagate the new input shape to higher layers. - */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); - void ReshapeLike(const Blob& other); - inline string shape_string() const { - ostringstream stream; - for (int i = 0; i < shape_.size(); ++i) { - stream << shape_[i] << " "; - } - stream << "(" << count_ << ")"; - return stream.str(); - } - inline const vector& shape() const { return shape_; } - /** - * @brief Returns the dimension of the index-th axis (or the negative index-th - * axis from the end, if index is negative). - * - * @param index the axis index, which may be negative as it will be - * "canonicalized" using CanonicalAxisIndex. - * Dies on out of range index. - */ - inline int shape(int index) const { - return shape_[CanonicalAxisIndex(index)]; - } - inline int num_axes() const { return shape_.size(); } - inline int count() const { return count_; } + /// @brief Deprecated; use Reshape(const vector& shape). + void Reshape(const int num, const int channels, const int height, + const int width); + /** + * @brief Change the dimensions of the blob, allocating new memory if + * necessary. + * + * This function can be called both to create an initial allocation + * of memory, and to adjust the dimensions of a top blob during Layer::Reshape + * or Layer::Forward. 
When changing the size of blob, memory will only be + * reallocated if sufficient memory does not already exist, and excess memory + * will never be freed. + * + * Note that reshaping an input blob and immediately calling Net::Backward is + * an error; either Net::Forward or Net::Reshape need to be called to + * propagate the new input shape to higher layers. + */ + void Reshape(const vector& shape); + void Reshape(const BlobShape& shape); + void ReshapeLike(const Blob& other); + inline string shape_string() const { + ostringstream stream; + for (int i = 0; i < shape_.size(); ++i) { + stream << shape_[i] << " "; + } + stream << "(" << count_ << ")"; + return stream.str(); + } + inline const vector& shape() const { + return shape_; + } + /** + * @brief Returns the dimension of the index-th axis (or the negative index-th + * axis from the end, if index is negative). + * + * @param index the axis index, which may be negative as it will be + * "canonicalized" using CanonicalAxisIndex. + * Dies on out of range index. + */ + inline int shape(int index) const { + return shape_[CanonicalAxisIndex(index)]; + } + inline int num_axes() const { + return shape_.size(); + } + inline int count() const { + return count_; + } - /** - * @brief Compute the volume of a slice; i.e., the product of dimensions - * among a range of axes. - * - * @param start_axis The first axis to include in the slice. - * - * @param end_axis The first axis to exclude from the slice. - */ - inline int count(int start_axis, int end_axis) const { - CHECK_LE(start_axis, end_axis); - CHECK_GE(start_axis, 0); - CHECK_GE(end_axis, 0); - CHECK_LE(start_axis, num_axes()); - CHECK_LE(end_axis, num_axes()); - int count = 1; - for (int i = start_axis; i < end_axis; ++i) { - count *= shape(i); - } - return count; - } - /** - * @brief Compute the volume of a slice spanning from a particular first - * axis to the final axis. - * - * @param start_axis The first axis to include in the slice. 
- */ - inline int count(int start_axis) const { - return count(start_axis, num_axes()); - } + /** + * @brief Compute the volume of a slice; i.e., the product of dimensions + * among a range of axes. + * + * @param start_axis The first axis to include in the slice. + * + * @param end_axis The first axis to exclude from the slice. + */ + inline int count(int start_axis, int end_axis) const { + CHECK_LE(start_axis, end_axis); + CHECK_GE(start_axis, 0); + CHECK_GE(end_axis, 0); + CHECK_LE(start_axis, num_axes()); + CHECK_LE(end_axis, num_axes()); + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape(i); + } + return count; + } + /** + * @brief Compute the volume of a slice spanning from a particular first + * axis to the final axis. + * + * @param start_axis The first axis to include in the slice. + */ + inline int count(int start_axis) const { + return count(start_axis, num_axes()); + } - /** - * @brief Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param index the axis index. - * If 0 <= index < num_axes(), return index. - * If -num_axes <= index <= -1, return (num_axes() - (-index)), - * e.g., the last axis index (num_axes() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int CanonicalAxisIndex(int axis_index) const { - CHECK_GE(axis_index, -num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - CHECK_LT(axis_index, num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - if (axis_index < 0) { - return axis_index + num_axes(); - } - return axis_index; - } + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param index the axis index. 
+ * If 0 <= index < num_axes(), return index. + * If -num_axes <= index <= -1, return (num_axes() - (-index)), + * e.g., the last axis index (num_axes() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int CanonicalAxisIndex(int axis_index) const { + CHECK_GE(axis_index, -num_axes()) + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); + CHECK_LT(axis_index, num_axes()) + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); + if (axis_index < 0) { + return axis_index + num_axes(); + } + return axis_index; + } - /// @brief Deprecated legacy shape accessor num: use shape(0) instead. - inline int num() const { return LegacyShape(0); } - /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. - inline int channels() const { return LegacyShape(1); } - /// @brief Deprecated legacy shape accessor height: use shape(2) instead. - inline int height() const { return LegacyShape(2); } - /// @brief Deprecated legacy shape accessor width: use shape(3) instead. - inline int width() const { return LegacyShape(3); } - inline int LegacyShape(int index) const { - CHECK_LE(num_axes(), 4) - << "Cannot use legacy accessors on Blobs with > 4 axes."; - CHECK_LT(index, 4); - CHECK_GE(index, -4); - if (index >= num_axes() || index < -num_axes()) { - // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse - // indexing) -- this special case simulates the one-padding used to fill - // extraneous axes of legacy blobs. - return 1; - } - return shape(index); - } + /// @brief Deprecated legacy shape accessor num: use shape(0) instead. + inline int num() const { + return LegacyShape(0); + } + /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. 
+ inline int channels() const { + return LegacyShape(1); + } + /// @brief Deprecated legacy shape accessor height: use shape(2) instead. + inline int height() const { + return LegacyShape(2); + } + /// @brief Deprecated legacy shape accessor width: use shape(3) instead. + inline int width() const { + return LegacyShape(3); + } + inline int LegacyShape(int index) const { + CHECK_LE(num_axes(), 4) + << "Cannot use legacy accessors on Blobs with > 4 axes."; + CHECK_LT(index, 4); + CHECK_GE(index, -4); + if (index >= num_axes() || index < -num_axes()) { + // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse + // indexing) -- this special case simulates the one-padding used to fill + // extraneous axes of legacy blobs. + return 1; + } + return shape(index); + } - inline int offset(const int n, const int c = 0, const int h = 0, - const int w = 0) const { - CHECK_GE(n, 0); - CHECK_LE(n, num()); - CHECK_GE(channels(), 0); - CHECK_LE(c, channels()); - CHECK_GE(height(), 0); - CHECK_LE(h, height()); - CHECK_GE(width(), 0); - CHECK_LE(w, width()); - return ((n * channels() + c) * height() + h) * width() + w; - } + inline int offset(const int n, const int c = 0, const int h = 0, + const int w = 0) const { + CHECK_GE(n, 0); + CHECK_LE(n, num()); + CHECK_GE(channels(), 0); + CHECK_LE(c, channels()); + CHECK_GE(height(), 0); + CHECK_LE(h, height()); + CHECK_GE(width(), 0); + CHECK_LE(w, width()); + return ((n * channels() + c) * height() + h) * width() + w; + } - inline int offset(const vector& indices) const { - CHECK_LE(indices.size(), num_axes()); - int offset = 0; - for (int i = 0; i < num_axes(); ++i) { - offset *= shape(i); - if (indices.size() > i) { - CHECK_GE(indices[i], 0); - CHECK_LT(indices[i], shape(i)); - offset += indices[i]; - } - } - return offset; - } - /** - * @brief Copy from a source Blob. 
- * - * @param source the Blob to copy from - * @param copy_diff if false, copy the data; if true, copy the diff - * @param reshape if false, require this Blob to be pre-shaped to the shape - * of other (and die otherwise); if true, Reshape this Blob to other's - * shape if necessary - */ - void CopyFrom(const Blob& source, bool copy_diff = false, - bool reshape = false); + inline int offset(const vector& indices) const { + CHECK_LE(indices.size(), num_axes()); + int offset = 0; + for (int i = 0; i < num_axes(); ++i) { + offset *= shape(i); + if (indices.size() > i) { + CHECK_GE(indices[i], 0); + CHECK_LT(indices[i], shape(i)); + offset += indices[i]; + } + } + return offset; + } + /** + * @brief Copy from a source Blob. + * + * @param source the Blob to copy from + * @param copy_diff if false, copy the data; if true, copy the diff + * @param reshape if false, require this Blob to be pre-shaped to the shape + * of other (and die otherwise); if true, Reshape this Blob to other's + * shape if necessary + */ + void CopyFrom(const Blob& source, bool copy_diff = false, + bool reshape = false); - inline Dtype data_at(const int n, const int c, const int h, - const int w) const { - return cpu_data()[offset(n, c, h, w)]; - } + inline Dtype data_at(const int n, const int c, const int h, + const int w) const { + return cpu_data()[offset(n, c, h, w)]; + } - inline Dtype diff_at(const int n, const int c, const int h, - const int w) const { - return cpu_diff()[offset(n, c, h, w)]; - } + inline Dtype diff_at(const int n, const int c, const int h, + const int w) const { + return cpu_diff()[offset(n, c, h, w)]; + } - inline Dtype data_at(const vector& index) const { - return cpu_data()[offset(index)]; - } + inline Dtype data_at(const vector& index) const { + return cpu_data()[offset(index)]; + } - inline Dtype diff_at(const vector& index) const { - return cpu_diff()[offset(index)]; - } + inline Dtype diff_at(const vector& index) const { + return cpu_diff()[offset(index)]; + } - 
inline const shared_ptr& data() const { - CHECK(data_); - return data_; - } + inline const shared_ptr& data() const { + CHECK(data_); + return data_; + } - inline const shared_ptr& diff() const { - CHECK(diff_); - return diff_; - } + inline const shared_ptr& diff() const { + CHECK(diff_); + return diff_; + } - const Dtype* cpu_data() const; - void set_cpu_data(Dtype* data); - const Dtype* gpu_data() const; - const Dtype* gpu_cache_data() const; - const Dtype* cpu_diff() const; - const Dtype* gpu_diff() const; - Dtype* mutable_cpu_data(); - Dtype* mutable_gpu_data(); - Dtype* mutable_cpu_diff(); - Dtype* mutable_gpu_diff(); - void Update(); - void FromProto(const BlobProto& proto, bool reshape = true); - void ToProto(BlobProto* proto, bool write_diff = false) const; + const Dtype* cpu_data() const; + void set_cpu_data(Dtype* data); + const Dtype* gpu_data() const; + const Dtype* gpu_cache_data() const; + const Dtype* cpu_diff() const; + const Dtype* gpu_diff() const; + Dtype* mutable_cpu_data(); + Dtype* mutable_gpu_data(); + Dtype* mutable_cpu_diff(); + Dtype* mutable_gpu_diff(); + void Update(); + void FromProto(const BlobProto& proto, bool reshape = true); + void ToProto(BlobProto* proto, bool write_diff = false) const; - /// @brief Compute the sum of absolute values (L1 norm) of the data. - Dtype asum_data() const; - /// @brief Compute the sum of absolute values (L1 norm) of the diff. - Dtype asum_diff() const; - /// @brief Compute the sum of squares (L2 norm squared) of the data. - Dtype sumsq_data() const; - /// @brief Compute the sum of squares (L2 norm squared) of the diff. - Dtype sumsq_diff() const; + /// @brief Compute the sum of absolute values (L1 norm) of the data. + Dtype asum_data() const; + /// @brief Compute the sum of absolute values (L1 norm) of the diff. + Dtype asum_diff() const; + /// @brief Compute the sum of squares (L2 norm squared) of the data. 
+ Dtype sumsq_data() const; + /// @brief Compute the sum of squares (L2 norm squared) of the diff. + Dtype sumsq_diff() const; - /// @brief Scale the blob data by a constant factor. - void scale_data(Dtype scale_factor); - /// @brief Scale the blob diff by a constant factor. - void scale_diff(Dtype scale_factor); + /// @brief Scale the blob data by a constant factor. + void scale_data(Dtype scale_factor); + /// @brief Scale the blob diff by a constant factor. + void scale_diff(Dtype scale_factor); - /** - * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the - * data_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's data_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareData(const Blob& other); - /** - * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the - * diff_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's diff_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareDiff(const Blob& other); - void set_data_layer(){data_->set_data_layer(); diff_->set_data_layer();}; + /** + * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the + * data_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's data_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareData(const Blob& other); + /** + * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the + * diff_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. 
+ * + * This deallocates the SyncedMemory holding this Blob's diff_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareDiff(const Blob& other); + void set_data_layer() { + data_->set_data_layer(); + diff_->set_data_layer(); + } + ; - bool ShapeEquals(const BlobProto& other); + bool ShapeEquals(const BlobProto& other); - protected: - shared_ptr data_; - shared_ptr diff_; - vector shape_; - int count_; - int capacity_; + protected: + shared_ptr data_; + shared_ptr diff_; + vector shape_; + int count_; + int capacity_; - DISABLE_COPY_AND_ASSIGN(Blob); -}; // class Blob + DISABLE_COPY_AND_ASSIGN (Blob); +}; +// class Blob -} // namespace caffe +}// namespace caffe #endif // CAFFE_BLOB_HPP_ diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index ac954a0e..0f3a7667 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -73,23 +73,20 @@ private:\ #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" //OpenCL: various of defines to choose the design schemes /* ifdef: use CPU random generator in dropout layer - ifndef: use GPU randome generator*/ + ifndef: use GPU randome generator*/ //#define use_cpu_generator_dropout - //#define print_memory_trace - //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ #define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme - for intial design, we use the same packing number for all conv layers*/ + for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 /*ifdef: use multi-command queues for groups in conv layer; ifndef: use single commane queue for groups*/ //#define multiQ //#define check_gradient - // OpenCL: various checks for different function calls. 
#define OCL_CHECK(condition) \ do { \ @@ -156,7 +153,9 @@ do{ \ }while(0) // See PR #1236 -namespace cv { class Mat; } +namespace cv { +class Mat; +} namespace caffe { @@ -186,77 +185,81 @@ void GlobalInit(int* pargc, char*** pargv); // A singleton class to hold common caffe stuff, such as the handler that // caffe is going to use for cublas, curand, etc. class Caffe { - public: - ~Caffe(); - inline static Caffe& Get() { - if (!singleton_.get()) { - singleton_.reset(new Caffe()); - } - return *singleton_; - } - enum Brew { CPU, GPU, APU }; - - // This random number generator facade hides boost and CUDA rng - // implementation from one another (for cross-platform compatibility). - class RNG { - public: - RNG(); - explicit RNG(unsigned int seed); - explicit RNG(const RNG&); - RNG& operator=(const RNG&); - void* generator(); - private: - class Generator; - shared_ptr generator_; - }; - - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); - } - return *(Get().random_generator_); - } + public: + ~Caffe(); + inline static Caffe& Get() { + if (!singleton_.get()) { + singleton_.reset(new Caffe()); + } + return *singleton_; + } + enum Brew { + CPU, GPU, APU + }; + + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). 
+ class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); + } #ifndef CPU_ONLY - //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - //inline static curandGenerator_t curand_generator() { - // return Get().curand_generator_; - //} + //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + //inline static curandGenerator_t curand_generator() { + // return Get().curand_generator_; + //} #endif - // Returns the mode: running on CPU or GPU. - inline static Brew mode() { return Get().mode_; } - // The setters for the variables - // Sets the mode. It is recommended that you don't change the mode halfway - // into the program since that may cause allocation of pinned memory being - // freed in a non-pinned way, which may cause problems - I haven't verified - // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { - Get().mode_ = mode; - } - // Sets the random seed of both boost and curand - static void set_random_seed(const unsigned int seed); - // Sets the device. Since we have cublas and curand stuff, set device also - // requires us to reset those values. - static void SetDevice(const int device_id); - // Prints the current GPU status. - static void DeviceQuery(); - - protected: + // Returns the mode: running on CPU or GPU. + inline static Brew mode() { + return Get().mode_; + } + // The setters for the variables + // Sets the mode. 
It is recommended that you don't change the mode halfway + // into the program since that may cause allocation of pinned memory being + // freed in a non-pinned way, which may cause problems - I haven't verified + // it personally but better to note it here in the header file. + inline static void set_mode(Brew mode) { + Get().mode_ = mode; + } + // Sets the random seed of both boost and curand + static void set_random_seed(const unsigned int seed); + // Sets the device. Since we have cublas and curand stuff, set device also + // requires us to reset those values. + static void SetDevice(const int device_id); + // Prints the current GPU status. + static void DeviceQuery(); + + protected: #ifndef CPU_ONLY - //cublasHandle_t cublas_handle_; - //curandGenerator_t curand_generator_; + //cublasHandle_t cublas_handle_; + //curandGenerator_t curand_generator_; #endif - shared_ptr random_generator_; + shared_ptr random_generator_; - Brew mode_; - static shared_ptr singleton_; + Brew mode_; + static shared_ptr singleton_; - private: - // The private constructor to avoid duplicate instantiation. - Caffe(); + private: + // The private constructor to avoid duplicate instantiation. + Caffe(); - DISABLE_COPY_AND_ASSIGN(Caffe); + DISABLE_COPY_AND_ASSIGN(Caffe); }; } // namespace caffe diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index eb77e762..879e84e7 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -25,122 +25,136 @@ namespace caffe { * * NOTE: does not implement Backwards operation. */ -template -class ArgMaxLayer : public Layer { - public: - /** - * @param param provides ArgMaxParameter argmax_param, - * with ArgMaxLayer options: - * - top_k (\b optional uint, default 1). - * the number @f$ K @f$ of maximal items to output. - * - out_max_val (\b optional bool, default false). - * if set, output a vector of pairs (max_ind, max_val) for each image. 
- */ - explicit ArgMaxLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ArgMax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val - * @f$ (N \times 2 \times K \times 1) @f$ - * the computed outputs @f$ - * y_n = \arg\max\limits_i x_{ni} - * @f$ (for @f$ K = 1 @f$). - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - bool out_max_val_; - size_t top_k_; +template +class ArgMaxLayer: public Layer { + public: + /** + * @param param provides ArgMaxParameter argmax_param, + * with ArgMaxLayer options: + * - top_k (\b optional uint, default 1). + * the number @f$ K @f$ of maximal items to output. + * - out_max_val (\b optional bool, default false). + * if set, output a vector of pairs (max_ind, max_val) for each image. 
+ */ + explicit ArgMaxLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ArgMax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val + * @f$ (N \times 2 \times K \times 1) @f$ + * the computed outputs @f$ + * y_n = \arg\max\limits_i x_{ni} + * @f$ (for @f$ K = 1 @f$). + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + NOT_IMPLEMENTED; + } + bool out_max_val_; + size_t top_k_; }; /** * @brief Takes at least two Blob%s and concatenates them along either the num * or channel dimension, outputting the result. */ -template -class ConcatLayer : public Layer { - public: - explicit ConcatLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_1 @f$ - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_2 @f$ - * -# ... 
- * - K @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_K @f$ - * @param top output Blob vector (length 1) - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * the concatenated output @f$ - * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to concatenated outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top gradient - * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the - * inputs @f$ - * \left[ \begin{array}{cccc} - * \frac{\partial E}{\partial x_1} & - * \frac{\partial E}{\partial x_2} & - * ... 
& - * \frac{\partial E}{\partial x_K} - * \end{array} \right] = - * \frac{\partial E}{\partial y} - * @f$ - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_concats_; - int concat_input_size_; - int concat_axis_; +template +class ConcatLayer: public Layer { + public: + explicit ConcatLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Concat"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_1 @f$ + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_2 @f$ + * -# ... + * - K @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_K @f$ + * @param top output Blob vector (length 1) + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * the concatenated output @f$ + * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to concatenated outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top gradient + * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the + * inputs @f$ + * \left[ \begin{array}{cccc} + * \frac{\partial E}{\partial x_1} & + * \frac{\partial E}{\partial x_2} & + * ... & + * \frac{\partial E}{\partial x_K} + * \end{array} \right] = + * \frac{\partial E}{\partial y} + * @f$ + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_concats_; + int concat_input_size_; + int concat_axis_; }; /** @@ -149,35 +163,42 @@ class ConcatLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class EltwiseLayer : public Layer { - public: - explicit EltwiseLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Eltwise"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - EltwiseParameter_EltwiseOp op_; - vector coeffs_; - Blob max_idx_; - - bool stable_prod_grad_; +template +class EltwiseLayer: public Layer { + public: + explicit EltwiseLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Eltwise"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + EltwiseParameter_EltwiseOp op_; + vector coeffs_; + Blob max_idx_; + + bool stable_prod_grad_; }; /** @@ -186,61 +207,68 @@ class EltwiseLayer : public Layer { * the corresponding item has to be filtered, 
non-zero means that corresponding * item needs to stay). */ -template -class FilterLayer : public Layer { - public: - explicit FilterLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Filter"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_1 @f$ - * -# ... - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_K @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the selector blob - * @param top output Blob vector (length 1+) - * -# @f$ (S \times C \times H \times W) @f$ () - * the filtered output @f$ x_1 @f$ - * where S is the number of items - * that haven't been filtered - * @f$ (S \times C \times H \times W) @f$ - * the filtered output @f$ x_K @f$ - * where S is the number of items - * that haven't been filtered - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the forwarded inputs. - * - * @param top output Blob vector (length 1+), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2+), into which the top error - * gradient is copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool first_reshape_; - vector indices_to_forward_; +template +class FilterLayer: public Layer { + public: + explicit FilterLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Filter"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_1 @f$ + * -# ... + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_K @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the selector blob + * @param top output Blob vector (length 1+) + * -# @f$ (S \times C \times H \times W) @f$ () + * the filtered output @f$ x_1 @f$ + * where S is the number of items + * that haven't been filtered + * @f$ (S \times C \times H \times W) @f$ + * the filtered output @f$ x_K @f$ + * where S is the number of items + * that haven't been filtered + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the forwarded inputs. + * + * @param top output Blob vector (length 1+), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2+), into which the top error + * gradient is copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool first_reshape_; + vector indices_to_forward_; }; /** @@ -253,41 +281,48 @@ class FilterLayer : public Layer { * and in Backward, the diff pointer of the bottom Blob to that of the top Blob * (see Blob::ShareDiff). */ -template -class FlattenLayer : public Layer { - public: - explicit FlattenLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Flatten"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs - * @param top output Blob vector (length 1) - * -# @f$ (N \times CHW \times 1 \times 1) @f$ - * the outputs -- i.e., the (virtually) copied, flattened inputs - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length K), into which the top error - * gradient is (virtually) copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class FlattenLayer: public Layer { + public: + explicit FlattenLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Flatten"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (N \times CHW \times 1 \times 1) @f$ + * the outputs -- i.e., the (virtually) copied, flattened inputs + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -296,35 +331,42 @@ class FlattenLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class InnerProductLayer : public Layer { - public: - explicit InnerProductLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "InnerProduct"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int M_; - int K_; - int N_; - bool bias_term_; - Blob bias_multiplier_; +template +class InnerProductLayer: public Layer { + public: + explicit InnerProductLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "InnerProduct"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; }; /** @@ -332,33 +374,40 @@ class InnerProductLayer : public Layer { * * TODO(dox): thorough documentation for 
Forward, Backward, and proto params. */ -template -class MVNLayer : public Layer { - public: - explicit MVNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MVN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob mean_, variance_, temp_; - - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - Dtype eps_; +template +class MVNLayer: public Layer { + public: + explicit MVNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MVN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob mean_, variance_, temp_; + + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + Dtype eps_; }; /* @@ -367,36 +416,47 @@ class MVNLayer : public Layer { * Note: similarly to FlattenLayer, this layer does not change the input values * (see FlattenLayer, 
Blob::ShareData and Blob::ShareDiff). */ -template -class ReshapeLayer : public Layer { - public: - explicit ReshapeLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reshape"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - /// @brief vector of axes indices whose dimensions we'll copy from the bottom - vector copy_axes_; - /// @brief the index of the axis whose dimension we infer, or -1 if none - int inferred_axis_; - /// @brief the product of the "constant" output dimensions - int constant_count_; +template +class ReshapeLayer: public Layer { + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reshape"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_gpu(const vector*>& top, + const 
vector& propagate_down, const vector*>& bottom) { + } + + /// @brief vector of axes indices whose dimensions we'll copy from the bottom + vector copy_axes_; + /// @brief the index of the axis whose dimension we infer, or -1 if none + int inferred_axis_; + /// @brief the product of the "constant" output dimensions + int constant_count_; }; /** @@ -406,71 +466,87 @@ class ReshapeLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template -class ReductionLayer : public Layer { - public: - explicit ReductionLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reduction"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief the reduction operation performed by the layer - ReductionParameter_ReductionOp op_; - /// @brief a scalar coefficient applied to all outputs - Dtype coeff_; - /// @brief the index of the first input axis to reduce - int axis_; - /// @brief the number of reductions performed - int num_; - /// @brief the input size of each reduction - int dim_; - /// @brief a helper Blob used for summation (op_ == SUM) - Blob sum_multiplier_; +template +class ReductionLayer: public Layer { + public: + explicit ReductionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual 
void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reduction"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief the reduction operation performed by the layer + ReductionParameter_ReductionOp op_; + /// @brief a scalar coefficient applied to all outputs + Dtype coeff_; + /// @brief the index of the first input axis to reduce + int axis_; + /// @brief the number of reductions performed + int num_; + /// @brief the input size of each reduction + int dim_; + /// @brief a helper Blob used for summation (op_ == SUM) + Blob sum_multiplier_; }; /** * @brief Ignores bottom blobs while producing no top blobs. (This is useful * to suppress outputs during testing.) */ -template -class SilenceLayer : public Layer { - public: - explicit SilenceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "Silence"; } - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - // We can't define Forward_gpu here, since STUB_GPU will provide - // its own definition for CPU_ONLY mode. 
- virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class SilenceLayer: public Layer { + public: + explicit SilenceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "Silence"; + } + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + // We can't define Forward_gpu here, since STUB_GPU will provide + // its own definition for CPU_ONLY mode. + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -478,37 +554,43 @@ class SilenceLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class SoftmaxLayer : public Layer { - public: - explicit SoftmaxLayer(const LayerParameter& param) - : Layer(param) { - } - ~SoftmaxLayer(); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Softmax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int outer_num_; - int inner_num_; - int softmax_axis_; - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - /// scale is an intermediate Blob to hold temporary results. - Blob scale_; +template +class SoftmaxLayer: public Layer { + public: + explicit SoftmaxLayer(const LayerParameter& param) + : Layer(param) { + } + ~SoftmaxLayer(); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Softmax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int outer_num_; + int inner_num_; + int softmax_axis_; + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + /// scale is an intermediate Blob to hold 
temporary results. + Blob scale_; }; #ifdef USE_CUDNN @@ -518,25 +600,25 @@ class SoftmaxLayer : public Layer { */ template class CuDNNSoftmaxLayer : public SoftmaxLayer { - public: - explicit CuDNNSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSoftmaxLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNSoftmaxLayer(const LayerParameter& param) + : SoftmaxLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNSoftmaxLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -546,30 +628,37 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class SplitLayer : public Layer { - public: - explicit SplitLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Split"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - cl_kernel gpu_add_kernel; +template +class SplitLayer: public Layer { + public: + explicit SplitLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Split"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + cl_kernel gpu_add_kernel; }; /** @@ -578,35 +667,42 @@ class SplitLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class SliceLayer : public Layer { - public: - explicit SliceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Slice"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 2; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_slices_; - int slice_size_; - int slice_axis_; - vector slice_point_; +template +class SliceLayer: public Layer { + public: + explicit SliceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Slice"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 2; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_slices_; + int slice_size_; + int slice_axis_; + vector slice_point_; }; } // namespace caffe diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 3958cb7e..442e4009 100644 
--- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -24,79 +24,94 @@ namespace caffe { * * TODO(dox): thorough documentation for Forward and proto params. */ -template -class BaseDataLayer : public Layer { - public: - explicit BaseDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden except by the BasePrefetchingDataLayer. - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) {} - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - protected: - TransformationParameter transform_param_; - shared_ptr > data_transformer_; - bool output_labels_; +template +class BaseDataLayer: public Layer { + public: + explicit BaseDataLayer(const LayerParameter& param); + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden except by the BasePrefetchingDataLayer. + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + } + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + + protected: + TransformationParameter transform_param_; + shared_ptr > data_transformer_; + bool output_labels_; }; -template -class BasePrefetchingDataLayer : - public BaseDataLayer, public InternalThread { - public: - explicit BasePrefetchingDataLayer(const LayerParameter& param) - : BaseDataLayer(param) {} - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden. - void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - virtual void CreatePrefetchThread(); - virtual void JoinPrefetchThread(); - // The thread's function - virtual void InternalThreadEntry() {} - - protected: - Blob prefetch_data_; - Blob prefetch_label_; - Blob transformed_data_; +template +class BasePrefetchingDataLayer: + public BaseDataLayer, public InternalThread { + public: + explicit BasePrefetchingDataLayer(const LayerParameter& param) + : BaseDataLayer(param) { + } + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden. 
+ void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + virtual void CreatePrefetchThread(); + virtual void JoinPrefetchThread(); + // The thread's function + virtual void InternalThreadEntry() { + } + + protected: + Blob prefetch_data_; + Blob prefetch_label_; + Blob transformed_data_; }; -template -class DataLayer : public BasePrefetchingDataLayer { - public: - explicit DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~DataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - virtual void InternalThreadEntry(); - - shared_ptr db_; - shared_ptr cursor_; +template +class DataLayer: public BasePrefetchingDataLayer { + public: + explicit DataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~DataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + virtual void InternalThreadEntry(); + + shared_ptr db_; + shared_ptr cursor_; }; /** @@ -104,31 +119,41 @@ class DataLayer : public BasePrefetchingDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template -class DummyDataLayer : public Layer { - public: - explicit DummyDataLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "DummyData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - vector > > fillers_; - vector refill_; +template +class DummyDataLayer: public Layer { + public: + explicit DummyDataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "DummyData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + + vector > > fillers_; + vector refill_; }; /** @@ -136,40 +161,50 @@ class DummyDataLayer : public Layer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template -class HDF5DataLayer : public Layer { - public: - explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} - virtual ~HDF5DataLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void LoadHDF5FileData(const char* filename); - - std::vector hdf_filenames_; - unsigned int num_files_; - unsigned int current_file_; - hsize_t current_row_; - std::vector > > hdf_blobs_; - std::vector data_permutation_; - std::vector file_permutation_; +template +class HDF5DataLayer: public Layer { + public: + explicit HDF5DataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~HDF5DataLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + } + virtual void LoadHDF5FileData(const char* filename); + + std::vector hdf_filenames_; + unsigned int num_files_; + unsigned int current_file_; + hsize_t current_row_; + std::vector > > hdf_blobs_; + std::vector data_permutation_; + std::vector file_permutation_; }; /** @@ -177,41 +212,51 @@ class HDF5DataLayer : public Layer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template -class HDF5OutputLayer : public Layer { - public: - explicit HDF5OutputLayer(const LayerParameter& param) - : Layer(param), file_opened_(false) {} - virtual ~HDF5OutputLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Output"; } - // TODO: no limit on the number of blobs - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - inline std::string file_name() const { return file_name_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void SaveBlobs(); - - bool file_opened_; - std::string file_name_; - hid_t file_id_; - Blob data_blob_; - Blob label_blob_; +template +class HDF5OutputLayer: public Layer { + public: + explicit HDF5OutputLayer(const LayerParameter& param) + : Layer(param), file_opened_(false) { + } + virtual ~HDF5OutputLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Output"; + } + // TODO: no limit on the number of blobs + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + inline std::string file_name() const { + return file_name_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void SaveBlobs(); + + bool file_opened_; + std::string file_name_; + hid_t file_id_; + Blob data_blob_; + Blob label_blob_; }; /** @@ -219,26 +264,33 @@ class HDF5OutputLayer : public Layer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template -class ImageDataLayer : public BasePrefetchingDataLayer { - public: - explicit ImageDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~ImageDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ImageData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - shared_ptr prefetch_rng_; - virtual void ShuffleImages(); - virtual void InternalThreadEntry(); - - vector > lines_; - int lines_id_; +template +class ImageDataLayer: public BasePrefetchingDataLayer { + public: + explicit ImageDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~ImageDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ImageData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleImages(); + virtual void InternalThreadEntry(); + + vector > lines_; + int lines_id_; }; /** @@ -246,44 +298,59 @@ class ImageDataLayer : public BasePrefetchingDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template -class MemoryDataLayer : public BaseDataLayer { - public: - explicit MemoryDataLayer(const LayerParameter& param) - : BaseDataLayer(param), has_new_data_(false) {} - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MemoryData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - virtual void AddDatumVector(const vector& datum_vector); - virtual void AddMatVector(const vector& mat_vector, - const vector& labels); - - // Reset should accept const pointers, but can't, because the memory - // will be given to Blob, which is mutable - void Reset(Dtype* data, Dtype* label, int n); - void set_batch_size(int new_size); - - int batch_size() { return batch_size_; } - int channels() { return channels_; } - int height() { return height_; } - int width() { return width_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - int batch_size_, channels_, height_, width_, size_; - Dtype* data_; - Dtype* labels_; - int n_; - size_t pos_; - Blob added_data_; - Blob added_label_; - bool has_new_data_; +template +class MemoryDataLayer: public BaseDataLayer { + public: + explicit MemoryDataLayer(const LayerParameter& param) + : BaseDataLayer(param), has_new_data_(false) { + } + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MemoryData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + virtual void AddDatumVector(const vector& datum_vector); + virtual void AddMatVector(const vector& mat_vector, + const vector& labels); + + // Reset should accept const pointers, but can't, because the memory + // will be given to Blob, which is mutable + void Reset(Dtype* data, Dtype* label, int n); + void set_batch_size(int 
new_size); + + int batch_size() { + return batch_size_; + } + int channels() { + return channels_; + } + int height() { + return height_; + } + int width() { + return width_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + int batch_size_, channels_, height_, width_, size_; + Dtype* data_; + Dtype* labels_; + int n_; + size_t pos_; + Blob added_data_; + Blob added_label_; + bool has_new_data_; }; /** @@ -292,34 +359,43 @@ class MemoryDataLayer : public BaseDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template -class WindowDataLayer : public BasePrefetchingDataLayer { - public: - explicit WindowDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~WindowDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "WindowData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - virtual unsigned int PrefetchRand(); - virtual void InternalThreadEntry(); - - shared_ptr prefetch_rng_; - vector > > image_database_; - enum WindowField { IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM }; - vector > fg_windows_; - vector > bg_windows_; - Blob data_mean_; - vector mean_values_; - bool has_mean_file_; - bool has_mean_values_; - bool cache_images_; - vector > image_database_cache_; +template +class WindowDataLayer: public BasePrefetchingDataLayer { + public: + explicit WindowDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~WindowDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "WindowData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + virtual 
unsigned int PrefetchRand(); + virtual void InternalThreadEntry(); + + shared_ptr prefetch_rng_; + vector > > image_database_; + enum WindowField { + IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM + }; + vector > fg_windows_; + vector > bg_windows_; + Blob data_mean_; + vector mean_values_; + bool has_mean_file_; + bool has_mean_values_; + bool cache_images_; + vector > image_database_cache_; }; } // namespace caffe diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 0ad68c80..94c32366 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -13,136 +13,136 @@ namespace caffe { * @brief Applies common transformations to the input data, such as * scaling, mirroring, substracting the image mean... */ -template +template class DataTransformer { - public: - explicit DataTransformer(const TransformationParameter& param, Phase phase); - virtual ~DataTransformer() {} - - /** - * @brief Initialize the Random number generations if needed by the - * transformation. - */ - void InitRand(); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to the data. - * - * @param datum - * Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See data_layer.cpp for an example. - */ - void Transform(const Datum& datum, Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. 
- */ - void Transform(const vector & datum_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & mat_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a cv::Mat - * - * @param cv_img - * cv::Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See image_data_layer.cpp for an example. - */ - void Transform(const cv::Mat& cv_img, Blob* transformed_blob); - - /** - * @brief Applies the same transformation defined in the data layer's - * transform_param block to all the num images in a input_blob. - * - * @param input_blob - * A Blob containing the data to be transformed. It applies the same - * transformation to all the num images in the blob. - * @param transformed_blob - * This is destination blob, it will contain as many images as the - * input blob. It can be part of top blob's data. - */ - void Transform(Blob* input_blob, Blob* transformed_blob); - - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param datum - * Datum containing the data to be transformed. - */ - vector InferBlobShape(const Datum& datum); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. 
- */ - vector InferBlobShape(const vector & datum_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - */ - vector InferBlobShape(const vector & mat_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param cv_img - * cv::Mat containing the data to be transformed. - */ - vector InferBlobShape(const cv::Mat& cv_img); - - protected: - /** - * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). - * - * @param n - * The upperbound (exclusive) value of the random number. - * @return - * A uniformly random integer value from ({0, 1, ..., n-1}). - */ - virtual int Rand(int n); - - void Transform(const Datum& datum, Dtype* transformed_data); - // Tranformation parameters - TransformationParameter param_; - - - shared_ptr rng_; - Phase phase_; - Blob data_mean_; - vector mean_values_; + public: + explicit DataTransformer(const TransformationParameter& param, Phase phase); + virtual ~DataTransformer() { + } + + /** + * @brief Initialize the Random number generations if needed by the + * transformation. + */ + void InitRand(); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the data. + * + * @param datum + * Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See data_layer.cpp for an example. + */ + void Transform(const Datum& datum, Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Datum. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. 
+ * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & datum_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Mat. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & mat_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a cv::Mat + * + * @param cv_img + * cv::Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See image_data_layer.cpp for an example. + */ + void Transform(const cv::Mat& cv_img, Blob* transformed_blob); + + /** + * @brief Applies the same transformation defined in the data layer's + * transform_param block to all the num images in a input_blob. + * + * @param input_blob + * A Blob containing the data to be transformed. It applies the same + * transformation to all the num images in the blob. + * @param transformed_blob + * This is destination blob, it will contain as many images as the + * input blob. It can be part of top blob's data. + */ + void Transform(Blob* input_blob, Blob* transformed_blob); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param datum + * Datum containing the data to be transformed. + */ + vector InferBlobShape(const Datum& datum); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. 
+ * It uses the first element to infer the shape of the blob. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + */ + vector InferBlobShape(const vector & datum_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + */ + vector InferBlobShape(const vector & mat_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param cv_img + * cv::Mat containing the data to be transformed. + */ + vector InferBlobShape(const cv::Mat& cv_img); + + protected: + /** + * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). + * + * @param n + * The upperbound (exclusive) value of the random number. + * @return + * A uniformly random integer value from ({0, 1, ..., n-1}). 
+ */ + virtual int Rand(int n); + + void Transform(const Datum& datum, Dtype* transformed_data); + // Tranformation parameters + TransformationParameter param_; + + shared_ptr rng_; + Phase phase_; + Blob data_mean_; + vector mean_values_; }; } // namespace caffe diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 3806eeb6..c6cefedc 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -32,45 +32,51 @@ #include "caffe/common.hpp" namespace caffe { -class Device{ -public: - Device():numPlatforms(0),numDevices(0),device_id(INT_MIN){} - ~Device(); - cl_uint numPlatforms; - cl_platform_id * platformIDs; - char platformName[64]; - char openclVersion[64]; - cl_uint numDevices; - cl_device_id * DeviceIDs; - - cl_context Context; - cl_command_queue CommandQueue; - cl_command_queue CommandQueue_helper; - cl_program Program; - cl_device_id * pDevices; - int device_id; +class Device { + public: + Device() + : numPlatforms(0), numDevices(0), device_id(INT_MIN) { + } + ~Device(); + cl_uint numPlatforms; + cl_platform_id * platformIDs; + char platformName[64]; + char openclVersion[64]; + cl_uint numDevices; + cl_device_id * DeviceIDs; - clblasOrder col; - clblasOrder row; - std::map Kernels; - - cl_int Init(int device_id = -1); - cl_int ConvertToString(std::string pFileName,std::string &Str); - void DisplayPlatformInfo(); - void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + cl_context Context; + cl_command_queue CommandQueue; + cl_command_queue CommandQueue_helper; + cl_program Program; + cl_device_id * pDevices; + int device_id; - void GetDeviceInfo(); - void DeviceQuery(); - int GetDevice(){return device_id;}; - void BuildProgram(std::string kernel_dir); + clblasOrder col; + clblasOrder row; + std::map Kernels; - template - void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str); - template - void appendBitfield(T info, T value, std::string name, std::string &str); - - cl_kernel 
GetKernel(std::string kernel_name); - void ReleaseKernels(); + cl_int Init(int device_id = -1); + cl_int ConvertToString(std::string pFileName, std::string &Str); + void DisplayPlatformInfo(); + void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + + void GetDeviceInfo(); + void DeviceQuery(); + int GetDevice() { + return device_id; + } + ; + void BuildProgram(std::string kernel_dir); + + template + void DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str); + template + void appendBitfield(T info, T value, std::string name, std::string &str); + + cl_kernel GetKernel(std::string kernel_name); + void ReleaseKernels(); }; extern std::string buildOption; extern Device amdDevice; diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 888f4a4b..6c47d7aa 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -16,113 +16,121 @@ namespace caffe { /// @brief Fills a Blob with constant or randomly-generated data. -template +template class Filler { - public: - explicit Filler(const FillerParameter& param) : filler_param_(param) {} - virtual ~Filler() {} - virtual void Fill(Blob* blob) = 0; - protected: - FillerParameter filler_param_; -}; // class Filler - + public: + explicit Filler(const FillerParameter& param) + : filler_param_(param) { + } + virtual ~Filler() { + } + virtual void Fill(Blob* blob) = 0; + protected: + FillerParameter filler_param_; +}; +// class Filler /// @brief Fills a Blob with constant values @f$ x = 0 @f$. 
-template -class ConstantFiller : public Filler { - public: - explicit ConstantFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - const int count = blob->count(); - const Dtype value = this->filler_param_.value(); - CHECK(count); - for (int i = 0; i < count; ++i) { - data[i] = value; - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class ConstantFiller: public Filler { + public: + explicit ConstantFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + const int count = blob->count(); + const Dtype value = this->filler_param_.value(); + CHECK(count); + for (int i = 0; i < count; ++i) { + data[i] = value; + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$. -template -class UniformFiller : public Filler { - public: - explicit UniformFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class UniformFiller: public Filler { + public: + explicit UniformFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), + Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$. 
-template -class GaussianFiller : public Filler { - public: - explicit GaussianFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - CHECK(blob->count()); - caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); - int sparse = this->filler_param_.sparse(); - CHECK_GE(sparse, -1); - if (sparse >= 0) { - // Sparse initialization is implemented for "weight" blobs; i.e. matrices. - // These have num == channels == 1; width is number of inputs; height is - // number of outputs. The 'sparse' variable specifies the mean number - // of non-zero input weights for a given output. - CHECK_GE(blob->num_axes(), 1); - const int num_outputs = blob->shape(0); - Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); - rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); - caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); - for (int i = 0; i < blob->count(); ++i) { - data[i] *= mask[i]; - } - } - } +template +class GaussianFiller: public Filler { + public: + explicit GaussianFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + CHECK(blob->count()); + caffe_rng_gaussian(blob->count(), + Dtype(this->filler_param_.mean()), + Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); + int sparse = this->filler_param_.sparse(); + CHECK_GE(sparse, -1); + if (sparse >= 0) { + // Sparse initialization is implemented for "weight" blobs; i.e. matrices. + // These have num == channels == 1; width is number of inputs; height is + // number of outputs. The 'sparse' variable specifies the mean number + // of non-zero input weights for a given output. 
+ CHECK_GE(blob->num_axes(), 1); + const int num_outputs = blob->shape(0); + Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); + rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); + int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); + for (int i = 0; i < blob->count(); ++i) { + data[i] *= mask[i]; + } + } + } - protected: - shared_ptr rand_vec_; + protected: + shared_ptr rand_vec_; }; /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$ * such that @f$ \forall i \sum_j x_{ij} = 1 @f$. */ -template -class PositiveUnitballFiller : public Filler { - public: - explicit PositiveUnitballFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); - // We expect the filler to not be called very frequently, so we will - // just use a simple implementation - int dim = blob->count() / blob->num(); - CHECK(dim); - for (int i = 0; i < blob->num(); ++i) { - Dtype sum = 0; - for (int j = 0; j < dim; ++j) { - sum += data[i * dim + j]; - } - for (int j = 0; j < dim; ++j) { - data[i * dim + j] /= sum; - } - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class PositiveUnitballFiller: public Filler { + public: + explicit PositiveUnitballFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + DCHECK(blob->count()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + // We expect the filler to not be called very frequently, so we will + // just use a simple implementation + int dim = blob->count() / blob->num(); + CHECK(dim); + for (int i = 0; i < blob->num(); ++i) { + Dtype sum = 0; + for (int j = 0; j < dim; ++j) { + sum += data[i * dim + j]; + } + for 
(int j = 0; j < dim; ++j) { + data[i * dim + j] /= sum; + } + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -141,29 +149,30 @@ class PositiveUnitballFiller : public Filler { * * TODO(dox): make notation in above comment consistent with rest & use LaTeX. */ -template -class XavierFiller : public Filler { - public: - explicit XavierFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; - } - Dtype scale = sqrt(Dtype(3) / n); - caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class XavierFiller: public Filler { + public: + explicit XavierFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() == + FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() == + FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype scale = sqrt(Dtype(3) / n); + caffe_rng_uniform(blob->count(), -scale, scale, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -183,83 +192,85 @@ class XavierFiller : public Filler { * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. 
Note that this * is currently not the case for inner product layers. */ -template -class MSRAFiller : public Filler { - public: - explicit MSRAFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; - } - Dtype std = sqrt(Dtype(2) / n); - caffe_rng_gaussian(blob->count(), Dtype(0), std, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class MSRAFiller: public Filler { + public: + explicit MSRAFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() == + FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() == + FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype std = sqrt(Dtype(2) / n); + caffe_rng_gaussian(blob->count(), Dtype(0), std, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /*! -@brief Fills a Blob with coefficients for bilinear interpolation. + @brief Fills a Blob with coefficients for bilinear interpolation. -A common use case is with the DeconvolutionLayer acting as upsampling. -You can upsample a feature map with shape of (B, C, H, W) by any integer factor -using the following proto. 
-\code -layer { - name: "upsample", type: "Deconvolution" - bottom: "{{bottom_name}}" top: "{{top_name}}" - convolution_param { - kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} - num_output: {{C}} group: {{C}} - pad: {{ceil((factor - 1) / 2.)}} - weight_filler: { type: "bilinear" } bias_term: false - } - param { lr_mult: 0 decay_mult: 0 } -} -\endcode -Please use this by replacing `{{}}` with your values. By specifying -`num_output: {{C}} group: {{C}}`, it behaves as -channel-wise convolution. The filter shape of this deconvolution layer will be -(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) -interpolation kernel for every channel of the filter identically. The resulting -shape of the top feature map will be (B, C, factor * H, factor * W). -Note that the learning rate and the -weight decay are set to 0 in order to keep coefficient values of bilinear -interpolation unchanged during training. If you apply this to an image, this -operation is equivalent to the following call in Python with Scikit.Image. -\code{.py} -out = skimage.transform.rescale(img, factor, mode='constant', cval=0) -\endcode + A common use case is with the DeconvolutionLayer acting as upsampling. + You can upsample a feature map with shape of (B, C, H, W) by any integer factor + using the following proto. + \code + layer { + name: "upsample", type: "Deconvolution" + bottom: "{{bottom_name}}" top: "{{top_name}}" + convolution_param { + kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} + num_output: {{C}} group: {{C}} + pad: {{ceil((factor - 1) / 2.)}} + weight_filler: { type: "bilinear" } bias_term: false + } + param { lr_mult: 0 decay_mult: 0 } + } + \endcode + Please use this by replacing `{{}}` with your values. By specifying + `num_output: {{C}} group: {{C}}`, it behaves as + channel-wise convolution. 
The filter shape of this deconvolution layer will be + (C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) + interpolation kernel for every channel of the filter identically. The resulting + shape of the top feature map will be (B, C, factor * H, factor * W). + Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. If you apply this to an image, this + operation is equivalent to the following call in Python with Scikit.Image. + \code{.py} + out = skimage.transform.rescale(img, factor, mode='constant', cval=0) + \endcode */ -template -class BilinearFiller : public Filler { - public: - explicit BilinearFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; - CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; - Dtype* data = blob->mutable_cpu_data(); - int f = ceil(blob->width() / 2.); - float c = (2 * f - 1 - f % 2) / (2. * f); - for (int i = 0; i < blob->count(); ++i) { - float x = i % blob->width(); - float y = (i / blob->width()) % blob->height(); - data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +template +class BilinearFiller: public Filler { + public: + explicit BilinearFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; + CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; + Dtype* data = blob->mutable_cpu_data(); + int f = ceil(blob->width() / 2.); + float c = (2 * f - 1 - f % 2) / (2. 
* f); + for (int i = 0; i < blob->count(); ++i) { + float x = i % blob->width(); + float y = (i / blob->width()) % blob->height(); + data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -268,27 +279,27 @@ class BilinearFiller : public Filler { * Ideally this would be replaced by a factory pattern, but we will leave it * this way for now. */ -template +template Filler* GetFiller(const FillerParameter& param) { - const std::string& type = param.type(); - if (type == "constant") { - return new ConstantFiller(param); - } else if (type == "gaussian") { - return new GaussianFiller(param); - } else if (type == "positive_unitball") { - return new PositiveUnitballFiller(param); - } else if (type == "uniform") { - return new UniformFiller(param); - } else if (type == "xavier") { - return new XavierFiller(param); - } else if (type == "msra") { - return new MSRAFiller(param); - } else if (type == "bilinear") { - return new BilinearFiller(param); - } else { - CHECK(false) << "Unknown filler name: " << param.type(); - } - return (Filler*)(NULL); + const std::string& type = param.type(); + if (type == "constant") { + return new ConstantFiller(param); + } else if (type == "gaussian") { + return new GaussianFiller(param); + } else if (type == "positive_unitball") { + return new PositiveUnitballFiller(param); + } else if (type == "uniform") { + return new UniformFiller(param); + } else if (type == "xavier") { + return new XavierFiller(param); + } else if (type == "msra") { + return new MSRAFiller(param); + } else if (type == "bilinear") { + return new BilinearFiller(param); + } else { + CHECK(false) << "Unknown filler name: " << param.type(); + } + return (Filler*) (NULL); } } // namespace caffe diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 815ca546..2df1806e 100644 --- a/include/caffe/internal_thread.hpp +++ 
b/include/caffe/internal_thread.hpp @@ -7,7 +7,9 @@ Forward declare boost::thread instead of including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) on OSX. */ -namespace boost { class thread; } +namespace boost { +class thread; +} namespace caffe { @@ -17,24 +19,27 @@ namespace caffe { * by reimplementing the virutal function InternalThreadEntry. */ class InternalThread { - public: - InternalThread() : thread_() {} - virtual ~InternalThread(); + public: + InternalThread() + : thread_() { + } + virtual ~InternalThread(); - /** Returns true if the thread was successfully started. **/ - bool StartInternalThread(); + /** Returns true if the thread was successfully started. **/ + bool StartInternalThread(); - /** Will not return until the internal thread has exited. */ - bool WaitForInternalThreadToExit(); + /** Will not return until the internal thread has exited. */ + bool WaitForInternalThreadToExit(); - bool is_started() const; + bool is_started() const; - protected: - /* Implement this method in your subclass - with the code you want your thread to run. */ - virtual void InternalThreadEntry() {} + protected: + /* Implement this method in your subclass + with the code you want your thread to run. */ + virtual void InternalThreadEntry() { + } - shared_ptr thread_; + shared_ptr thread_; }; } // namespace caffe diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index e2eba196..b01ea959 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -23,446 +23,475 @@ namespace caffe { * gradients with respect to their input Blob%s, given the error gradients with * their output Blob%s. */ -template +template class Layer { - public: - /** - * You should not implement your own constructor. Any set up code should go - * to SetUp(), where the dimensions of the bottom blobs are provided to the - * layer. - */ - explicit Layer(const LayerParameter& param) - : layer_param_(param) { - // Set phase and copy blobs (if there are any). 
- phase_ = param.phase(); - if (layer_param_.blobs_size() > 0) { - blobs_.resize(layer_param_.blobs_size()); - for (int i = 0; i < layer_param_.blobs_size(); ++i) { - blobs_[i].reset(new Blob()); - blobs_[i]->FromProto(layer_param_.blobs(i)); - } - } - } - virtual ~Layer() {} - - /** - * @brief Implements common layer setup functionality. - * - * @param bottom the preshaped input blobs - * @param top - * the allocated but unshaped output blobs, to be shaped by Reshape - * - * Checks that the number of bottom and top blobs is correct. - * Calls LayerSetUp to do special layer setup for individual layer types, - * followed by Reshape to set up sizes of top blobs and internal buffers. - * Sets up the loss weight multiplier blobs for any non-zero loss weights. - * This method may not be overridden. - */ - void SetUp(const vector*>& bottom, - const vector*>& top) { - CheckBlobCounts(bottom, top); - LayerSetUp(bottom, top); - Reshape(bottom, top); - SetLossWeights(top); - } - - /** - * @brief Does layer-specific setup: your layer should implement this function - * as well as Reshape. - * - * @param bottom - * the preshaped input blobs, whose data fields store the input data for - * this layer - * @param top - * the allocated but unshaped output blobs - * - * This method should do one-time layer specific setup. This includes reading - * and processing relevent parameters from the layer_param_. - * Setting up the shapes of top blobs and internal buffers should be done in - * Reshape, which will be called before the forward pass to - * adjust the top blob sizes. - */ - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) {} - - /** - * @brief Adjust the shapes of top blobs and internal buffers to accomodate - * the shapes of the bottom blobs. 
- * - * @param bottom the input blobs, with the requested input shapes - * @param top the top blobs, which should be reshaped as needed - * - * This method should reshape top blobs as needed according to the shapes - * of the bottom (input) blobs, as well as reshaping any internal buffers - * and making any other necessary adjustments so that the layer can - * accomodate the bottom blobs. - */ - virtual void Reshape(const vector*>& bottom, - const vector*>& top) = 0; - - /** - * @brief Given the bottom blobs, compute the top blobs and the loss. - * - * @param bottom - * the input blobs, whose data fields store the input data for this layer - * @param top - * the preshaped output blobs, whose data fields will store this layers' - * outputs - * \return The total loss from the layer. - * - * The Forward wrapper calls the relevant device wrapper function - * (Forward_cpu or Forward_gpu) to compute the top blob values given the - * bottom blobs. If the layer has any non-zero loss_weights, the wrapper - * then computes and returns the loss. - * - * Your layer should implement Forward_cpu and (optionally) Forward_gpu. - */ - inline Dtype Forward(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Given the top blob error gradients, compute the bottom blob error - * gradients. - * - * @param top - * the output blobs, whose diff fields store the gradient of the error - * with respect to themselves - * @param propagate_down - * a vector with equal length to bottom, with each index indicating - * whether to propagate the error gradients down to the bottom blob at - * the corresponding index - * @param bottom - * the input blobs, whose diff fields will store the gradient of the error - * with respect to themselves after Backward is run - * - * The Backward wrapper calls the relevant device wrapper function - * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the - * top blob diffs. 
- * - * Your layer should implement Backward_cpu and (optionally) Backward_gpu. - */ - inline void Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom); - - /** - * @brief Returns the vector of learnable parameter blobs. - */ - vector > >& blobs() { - return blobs_; - } - - /** - * @brief Returns the layer parameter. - */ - const LayerParameter& layer_param() const { return layer_param_; } - - /** - * @brief Writes the layer parameter to a protocol buffer - */ - virtual void ToProto(LayerParameter* param, bool write_diff = false); - - /** - * @brief Returns the scalar loss associated with a top blob at a given index. - */ - inline Dtype loss(const int top_index) const { - return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); - } - - /** - * @brief Sets the loss associated with a top blob at a given index. - */ - inline void set_loss(const int top_index, const Dtype value) { - if (loss_.size() <= top_index) { - loss_.resize(top_index + 1, Dtype(0)); - } - loss_[top_index] = value; - } - - /** - * @brief Returns the layer type. - */ - virtual inline const char* type() const { return ""; } - - /** - * @brief Returns the exact number of bottom blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of bottom blobs. - */ - virtual inline int ExactNumBottomBlobs() const { return -1; } - /** - * @brief Returns the minimum number of bottom blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of bottom blobs. - */ - virtual inline int MinBottomBlobs() const { return -1; } - /** - * @brief Returns the maximum number of bottom blobs required by the layer, - * or -1 if no maximum number is required. 
- * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of bottom blobs. - */ - virtual inline int MaxBottomBlobs() const { return -1; } - /** - * @brief Returns the exact number of top blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of top blobs. - */ - virtual inline int ExactNumTopBlobs() const { return -1; } - /** - * @brief Returns the minimum number of top blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of top blobs. - */ - virtual inline int MinTopBlobs() const { return -1; } - /** - * @brief Returns the maximum number of top blobs required by the layer, - * or -1 if no maximum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of top blobs. - */ - virtual inline int MaxTopBlobs() const { return -1; } - /** - * @brief Returns true if the layer requires an equal number of bottom and - * top blobs. - * - * This method should be overridden to return true if your layer expects an - * equal number of bottom and top blobs. - */ - virtual inline bool EqualNumBottomTopBlobs() const { return false; } - - /** - * @brief Return whether "anonymous" top blobs are created automatically - * by the layer. - * - * If this method returns true, Net::Init will create enough "anonymous" top - * blobs to fulfill the requirement specified by ExactNumTopBlobs() or - * MinTopBlobs(). - */ - virtual inline bool AutoTopBlobs() const { return false; } - - /** - * @brief Return whether to allow force_backward for a given bottom blob - * index. 
- * - * If AllowForceBackward(i) == false, we will ignore the force_backward - * setting and backpropagate to blob i only if it needs gradient information - * (as is done when force_backward == false). - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } - - /** - * @brief Specifies whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - * - * You can safely ignore false values and always compute gradients - * for all parameters, but possibly with wasteful computation. - */ - inline bool param_propagate_down(const int param_id) { - return (param_propagate_down_.size() > param_id) ? - param_propagate_down_[param_id] : false; - } - /** - * @brief Sets whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - */ - inline void set_param_propagate_down(const int param_id, const bool value) { - if (param_propagate_down_.size() <= param_id) { - param_propagate_down_.resize(param_id + 1, true); - } - param_propagate_down_[param_id] = value; - } - - - protected: - /** The protobuf that stores the layer parameters */ - LayerParameter layer_param_; - /** The phase: TRAIN or TEST */ - Phase phase_; - /** The vector that stores the learnable parameters as a set of blobs. */ - vector > > blobs_; - /** Vector indicating whether to compute the diff of each param blob. */ - vector param_propagate_down_; - - /** The vector that indicates whether each top blob has a non-zero weight in - * the objective function. */ - vector loss_; - - /** @brief Using the CPU device, compute the layer output. */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) = 0; - /** - * @brief Using the GPU device, compute the layer output. - * Fall back to Forward_cpu() if unavailable. 
- */ - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // LOG(WARNING) << "Using CPU code as backup."; - return Forward_cpu(bottom, top); - } - - /** - * @brief Using the CPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) = 0; - /** - * @brief Using the GPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - * Fall back to Backward_cpu() if unavailable. - */ - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - // LOG(WARNING) << "Using CPU code as backup."; - Backward_cpu(top, propagate_down, bottom); - } - - /** - * Called by the parent Layer's SetUp to check that the number of bottom - * and top Blobs provided as input match the expected numbers specified by - * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. 
- */ - virtual void CheckBlobCounts(const vector*>& bottom, - const vector*>& top) { - if (ExactNumBottomBlobs() >= 0) { - CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) - << type() << " Layer takes " << ExactNumBottomBlobs() - << " bottom blob(s) as input."; - } - if (MinBottomBlobs() >= 0) { - CHECK_LE(MinBottomBlobs(), bottom.size()) - << type() << " Layer takes at least " << MinBottomBlobs() - << " bottom blob(s) as input."; - } - if (MaxBottomBlobs() >= 0) { - CHECK_GE(MaxBottomBlobs(), bottom.size()) - << type() << " Layer takes at most " << MaxBottomBlobs() - << " bottom blob(s) as input."; - } - if (ExactNumTopBlobs() >= 0) { - CHECK_EQ(ExactNumTopBlobs(), top.size()) - << type() << " Layer produces " << ExactNumTopBlobs() - << " top blob(s) as output."; - } - if (MinTopBlobs() >= 0) { - CHECK_LE(MinTopBlobs(), top.size()) - << type() << " Layer produces at least " << MinTopBlobs() - << " top blob(s) as output."; - } - if (MaxTopBlobs() >= 0) { - CHECK_GE(MaxTopBlobs(), top.size()) - << type() << " Layer produces at most " << MaxTopBlobs() - << " top blob(s) as output."; - } - if (EqualNumBottomTopBlobs()) { - CHECK_EQ(bottom.size(), top.size()) - << type() << " Layer produces one top blob as output for each " - << "bottom blob input."; - } - } - - /** - * Called by SetUp to initialize the weights associated with any top blobs in - * the loss function. Store non-zero loss weights in the diff blob. 
- */ - inline void SetLossWeights(const vector*>& top) { - const int num_loss_weights = layer_param_.loss_weight_size(); - if (num_loss_weights) { - CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " - "unspecified or specified once per top blob."; - for (int top_id = 0; top_id < top.size(); ++top_id) { - const Dtype loss_weight = layer_param_.loss_weight(top_id); - if (loss_weight == Dtype(0)) { continue; } - this->set_loss(top_id, loss_weight); - const int count = top[top_id]->count(); - Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); - caffe_set(count, loss_weight, loss_multiplier); - } - } - } - - DISABLE_COPY_AND_ASSIGN(Layer); -}; // class Layer + public: + /** + * You should not implement your own constructor. Any set up code should go + * to SetUp(), where the dimensions of the bottom blobs are provided to the + * layer. + */ + explicit Layer(const LayerParameter& param) + : layer_param_(param) { + // Set phase and copy blobs (if there are any). + phase_ = param.phase(); + if (layer_param_.blobs_size() > 0) { + blobs_.resize(layer_param_.blobs_size()); + for (int i = 0; i < layer_param_.blobs_size(); ++i) { + blobs_[i].reset(new Blob()); + blobs_[i]->FromProto(layer_param_.blobs(i)); + } + } + } + virtual ~Layer() { + } + + /** + * @brief Implements common layer setup functionality. + * + * @param bottom the preshaped input blobs + * @param top + * the allocated but unshaped output blobs, to be shaped by Reshape + * + * Checks that the number of bottom and top blobs is correct. + * Calls LayerSetUp to do special layer setup for individual layer types, + * followed by Reshape to set up sizes of top blobs and internal buffers. + * Sets up the loss weight multiplier blobs for any non-zero loss weights. + * This method may not be overridden. 
+ */ + void SetUp(const vector*>& bottom, + const vector*>& top) { + CheckBlobCounts(bottom, top); + LayerSetUp(bottom, top); + Reshape(bottom, top); + SetLossWeights(top); + } + + /** + * @brief Does layer-specific setup: your layer should implement this function + * as well as Reshape. + * + * @param bottom + * the preshaped input blobs, whose data fields store the input data for + * this layer + * @param top + * the allocated but unshaped output blobs + * + * This method should do one-time layer specific setup. This includes reading + * and processing relevent parameters from the layer_param_. + * Setting up the shapes of top blobs and internal buffers should be done in + * Reshape, which will be called before the forward pass to + * adjust the top blob sizes. + */ + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + } + + /** + * @brief Adjust the shapes of top blobs and internal buffers to accomodate + * the shapes of the bottom blobs. + * + * @param bottom the input blobs, with the requested input shapes + * @param top the top blobs, which should be reshaped as needed + * + * This method should reshape top blobs as needed according to the shapes + * of the bottom (input) blobs, as well as reshaping any internal buffers + * and making any other necessary adjustments so that the layer can + * accomodate the bottom blobs. + */ + virtual void Reshape(const vector*>& bottom, + const vector*>& top) = 0; + + /** + * @brief Given the bottom blobs, compute the top blobs and the loss. + * + * @param bottom + * the input blobs, whose data fields store the input data for this layer + * @param top + * the preshaped output blobs, whose data fields will store this layers' + * outputs + * \return The total loss from the layer. + * + * The Forward wrapper calls the relevant device wrapper function + * (Forward_cpu or Forward_gpu) to compute the top blob values given the + * bottom blobs. 
If the layer has any non-zero loss_weights, the wrapper + * then computes and returns the loss. + * + * Your layer should implement Forward_cpu and (optionally) Forward_gpu. + */ + inline Dtype Forward(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Given the top blob error gradients, compute the bottom blob error + * gradients. + * + * @param top + * the output blobs, whose diff fields store the gradient of the error + * with respect to themselves + * @param propagate_down + * a vector with equal length to bottom, with each index indicating + * whether to propagate the error gradients down to the bottom blob at + * the corresponding index + * @param bottom + * the input blobs, whose diff fields will store the gradient of the error + * with respect to themselves after Backward is run + * + * The Backward wrapper calls the relevant device wrapper function + * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the + * top blob diffs. + * + * Your layer should implement Backward_cpu and (optionally) Backward_gpu. + */ + inline void Backward(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom); + + /** + * @brief Returns the vector of learnable parameter blobs. + */ + vector > >& blobs() { + return blobs_; + } + + /** + * @brief Returns the layer parameter. + */ + const LayerParameter& layer_param() const { + return layer_param_; + } + + /** + * @brief Writes the layer parameter to a protocol buffer + */ + virtual void ToProto(LayerParameter* param, bool write_diff = false); + + /** + * @brief Returns the scalar loss associated with a top blob at a given index. + */ + inline Dtype loss(const int top_index) const { + return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); + } + + /** + * @brief Sets the loss associated with a top blob at a given index. 
+ */ + inline void set_loss(const int top_index, const Dtype value) { + if (loss_.size() <= top_index) { + loss_.resize(top_index + 1, Dtype(0)); + } + loss_[top_index] = value; + } + + /** + * @brief Returns the layer type. + */ + virtual inline const char* type() const { + return ""; + } + + /** + * @brief Returns the exact number of bottom blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of bottom blobs. + */ + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of bottom blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of bottom blobs. + */ + virtual inline int MinBottomBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of bottom blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of bottom blobs. + */ + virtual inline int MaxBottomBlobs() const { + return -1; + } + /** + * @brief Returns the exact number of top blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of top blobs. + */ + virtual inline int ExactNumTopBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of top blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of top blobs. 
+ */ + virtual inline int MinTopBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of top blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of top blobs. + */ + virtual inline int MaxTopBlobs() const { + return -1; + } + /** + * @brief Returns true if the layer requires an equal number of bottom and + * top blobs. + * + * This method should be overridden to return true if your layer expects an + * equal number of bottom and top blobs. + */ + virtual inline bool EqualNumBottomTopBlobs() const { + return false; + } + + /** + * @brief Return whether "anonymous" top blobs are created automatically + * by the layer. + * + * If this method returns true, Net::Init will create enough "anonymous" top + * blobs to fulfill the requirement specified by ExactNumTopBlobs() or + * MinTopBlobs(). + */ + virtual inline bool AutoTopBlobs() const { + return false; + } + + /** + * @brief Return whether to allow force_backward for a given bottom blob + * index. + * + * If AllowForceBackward(i) == false, we will ignore the force_backward + * setting and backpropagate to blob i only if it needs gradient information + * (as is done when force_backward == false). + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + /** + * @brief Specifies whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. + * + * You can safely ignore false values and always compute gradients + * for all parameters, but possibly with wasteful computation. + */ + inline bool param_propagate_down(const int param_id) { + return + (param_propagate_down_.size() > param_id) ? + param_propagate_down_[param_id] : false; + } + /** + * @brief Sets whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. 
+ */ + inline void set_param_propagate_down(const int param_id, const bool value) { + if (param_propagate_down_.size() <= param_id) { + param_propagate_down_.resize(param_id + 1, true); + } + param_propagate_down_[param_id] = value; + } + + protected: + /** The protobuf that stores the layer parameters */ + LayerParameter layer_param_; + /** The phase: TRAIN or TEST */ + Phase phase_; + /** The vector that stores the learnable parameters as a set of blobs. */ + vector > > blobs_; + /** Vector indicating whether to compute the diff of each param blob. */ + vector param_propagate_down_; + + /** The vector that indicates whether each top blob has a non-zero weight in + * the objective function. */ + vector loss_; + + /** @brief Using the CPU device, compute the layer output. */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) = 0; + /** + * @brief Using the GPU device, compute the layer output. + * Fall back to Forward_cpu() if unavailable. + */ + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // LOG(WARNING) << "Using CPU code as backup."; + return Forward_cpu(bottom, top); + } + + /** + * @brief Using the CPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) = 0; + /** + * @brief Using the GPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + * Fall back to Backward_cpu() if unavailable. 
+ */ + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + // LOG(WARNING) << "Using CPU code as backup."; + Backward_cpu(top, propagate_down, bottom); + } + + /** + * Called by the parent Layer's SetUp to check that the number of bottom + * and top Blobs provided as input match the expected numbers specified by + * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. + */ + virtual void CheckBlobCounts(const vector*>& bottom, + const vector*>& top) { + if (ExactNumBottomBlobs() >= 0) { + CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) + << type() << " Layer takes " << ExactNumBottomBlobs() + << " bottom blob(s) as input."; + } + if (MinBottomBlobs() >= 0) { + CHECK_LE(MinBottomBlobs(), bottom.size()) + << type() << " Layer takes at least " << MinBottomBlobs() + << " bottom blob(s) as input."; + } + if (MaxBottomBlobs() >= 0) { + CHECK_GE(MaxBottomBlobs(), bottom.size()) + << type() << " Layer takes at most " << MaxBottomBlobs() + << " bottom blob(s) as input."; + } + if (ExactNumTopBlobs() >= 0) { + CHECK_EQ(ExactNumTopBlobs(), top.size()) + << type() << " Layer produces " << ExactNumTopBlobs() + << " top blob(s) as output."; + } + if (MinTopBlobs() >= 0) { + CHECK_LE(MinTopBlobs(), top.size()) + << type() << " Layer produces at least " << MinTopBlobs() + << " top blob(s) as output."; + } + if (MaxTopBlobs() >= 0) { + CHECK_GE(MaxTopBlobs(), top.size()) + << type() << " Layer produces at most " << MaxTopBlobs() + << " top blob(s) as output."; + } + if (EqualNumBottomTopBlobs()) { + CHECK_EQ(bottom.size(), top.size()) + << type() << " Layer produces one top blob as output for each " + << "bottom blob input."; + } + } + + /** + * Called by SetUp to initialize the weights associated with any top blobs in + * the loss function. Store non-zero loss weights in the diff blob. 
+ */ + inline void SetLossWeights(const vector*>& top) { + const int num_loss_weights = layer_param_.loss_weight_size(); + if (num_loss_weights) { + CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " + "unspecified or specified once per top blob."; + for (int top_id = 0; top_id < top.size(); ++top_id) { + const Dtype loss_weight = layer_param_.loss_weight(top_id); + if (loss_weight == Dtype(0)) { + continue; + } + this->set_loss(top_id, loss_weight); + const int count = top[top_id]->count(); + Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); + caffe_set(count, loss_weight, loss_multiplier); + } + } + } + + DISABLE_COPY_AND_ASSIGN (Layer); +}; +// class Layer // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. -template +template inline Dtype Layer::Forward(const vector*>& bottom, - const vector*>& top) { - Dtype loss = 0; - Reshape(bottom, top); - switch (Caffe::mode()) { - case Caffe::CPU: - Forward_cpu(bottom, top); - for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->cpu_data(); - const Dtype* loss_weights = top[top_id]->cpu_diff(); - loss += caffe_cpu_dot(count, data, loss_weights); - } - break; - case Caffe::GPU: - Forward_gpu(bottom, top); + const vector*>& top) { + Dtype loss = 0; + Reshape(bottom, top); + switch (Caffe::mode()) { + case Caffe::CPU: + Forward_cpu(bottom, top); + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { + continue; + } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->cpu_data(); + const Dtype* loss_weights = top[top_id]->cpu_diff(); + loss += caffe_cpu_dot(count, data, loss_weights); + } + break; + case Caffe::GPU: + Forward_gpu(bottom, top); #ifndef CPU_ONLY - for (int top_id = 0; top_id < top.size(); ++top_id) { - if 
(!this->loss(top_id)) { continue; } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->gpu_data(); - const Dtype* loss_weights = top[top_id]->gpu_diff(); - Dtype blob_loss = 0; - caffe_gpu_dot(count, data, loss_weights, &blob_loss); - loss += blob_loss; - } + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { + continue; + } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->gpu_data(); + const Dtype* loss_weights = top[top_id]->gpu_diff(); + Dtype blob_loss = 0; + caffe_gpu_dot(count, data, loss_weights, &blob_loss); + loss += blob_loss; + } #endif - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } - return loss; + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } + return loss; } -template +template inline void Layer::Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - switch (Caffe::mode()) { - case Caffe::CPU: - Backward_cpu(top, propagate_down, bottom); - break; - case Caffe::GPU: - Backward_gpu(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } + const vector& propagate_down, + const vector*>& bottom) { + switch (Caffe::mode()) { + case Caffe::CPU: + Backward_cpu(top, propagate_down, bottom); + break; + case Caffe::GPU: + Backward_gpu(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } } // Serialize LayerParameter to protocol buffer -template +template void Layer::ToProto(LayerParameter* param, bool write_diff) { - param->Clear(); - param->CopyFrom(layer_param_); - param->clear_blobs(); - for (int i = 0; i < blobs_.size(); ++i) { - blobs_[i]->ToProto(param->add_blobs(), write_diff); - } + param->Clear(); + param->CopyFrom(layer_param_); + param->clear_blobs(); + for (int i = 0; i < blobs_.size(); ++i) { + blobs_[i]->ToProto(param->add_blobs(), write_diff); + } } } // namespace caffe diff --git a/include/caffe/layer_factory.hpp 
b/include/caffe/layer_factory.hpp index 2fcd9386..e679ae6a 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -47,69 +47,68 @@ namespace caffe { -template +template class Layer; -template +template class LayerRegistry { - public: - typedef shared_ptr > (*Creator)(const LayerParameter&); - typedef std::map CreatorRegistry; - - static CreatorRegistry& Registry() { - static CreatorRegistry* g_registry_ = new CreatorRegistry(); - return *g_registry_; - } - - // Adds a creator. - static void AddCreator(const string& type, Creator creator) { - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 0) - << "Layer type " << type << " already registered."; - registry[type] = creator; - } - - // Get a layer using a LayerParameter. - static shared_ptr > CreateLayer(const LayerParameter& param) { - LOG(INFO) << "Creating layer " << param.name(); - const string& type = param.type(); - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type - << " (known types: " << LayerTypeList() << ")"; - return registry[type](param); - } - - private: - // Layer registry should never be instantiated - everything is done with its - // static variables. - LayerRegistry() {} - - static string LayerTypeList() { - CreatorRegistry& registry = Registry(); - string layer_types; - for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { - if (iter != registry.begin()) { - layer_types += ", "; - } - layer_types += iter->first; - } - return layer_types; - } + public: + typedef shared_ptr > (*Creator)(const LayerParameter&); + typedef std::map CreatorRegistry; + + static CreatorRegistry& Registry() { + static CreatorRegistry* g_registry_ = new CreatorRegistry(); + return *g_registry_; + } + + // Adds a creator. 
+ static void AddCreator(const string& type, Creator creator) { + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 0) + << "Layer type " << type << " already registered."; + registry[type] = creator; + } + + // Get a layer using a LayerParameter. + static shared_ptr > CreateLayer(const LayerParameter& param) { + LOG(INFO) << "Creating layer " << param.name(); + const string& type = param.type(); + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type + << " (known types: " << LayerTypeList() << ")"; + return registry[type](param); + } + + private: + // Layer registry should never be instantiated - everything is done with its + // static variables. + LayerRegistry() { + } + + static string LayerTypeList() { + CreatorRegistry& registry = Registry(); + string layer_types; + for (typename CreatorRegistry::iterator iter = registry.begin(); + iter != registry.end(); ++iter) { + if (iter != registry.begin()) { + layer_types += ", "; + } + layer_types += iter->first; + } + return layer_types; + } }; - -template +template class LayerRegisterer { - public: - LayerRegisterer(const string& type, - shared_ptr > (*creator)(const LayerParameter&)) { - // LOG(INFO) << "Registering layer type: " << type; - LayerRegistry::AddCreator(type, creator); - } + public: + LayerRegisterer(const string& type, + shared_ptr > (*creator)(const LayerParameter&)) { + // LOG(INFO) << "Registering layer type: " << type; + LayerRegistry::AddCreator(type, creator); + } }; - #define REGISTER_LAYER_CREATOR(type, creator) \ static LayerRegisterer g_creator_f_##type(#type, creator); \ static LayerRegisterer g_creator_d_##type(#type, creator) \ diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index d1408fd7..9e74ca85 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -19,73 +19,81 @@ const float kLOG_THRESHOLD = 1e-20; * @brief Computes the classification accuracy for a 
one-of-many * classification task. */ -template -class AccuracyLayer : public Layer { - public: - /** - * @param param provides AccuracyParameter accuracy_param, - * with AccuracyLayer options: - * - top_k (\b optional, default 1). - * Sets the maximum rank @f$ k @f$ at which a prediction is considered - * correct. For example, if @f$ k = 5 @f$, a prediction is counted - * correct if the correct label is among the top 5 predicted labels. - */ - explicit AccuracyLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Accuracy"; } - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$, a Blob with values in - * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of - * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted - * label @f$ \hat{l}_n @f$ given by its maximal index: - * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels @f$ l @f$, an integer-valued Blob with values - * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ - * indicating the correct class label among the @f$ K @f$ classes - * @param top output Blob vector (length 1) - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * the computed accuracy: @f$ - * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} - * @f$, where @f$ - * \delta\{\mathrm{condition}\} = \left\{ - * \begin{array}{lr} - * 1 & \mbox{if condition} \\ +template +class AccuracyLayer: public Layer { + public: + /** + * @param param provides AccuracyParameter accuracy_param, + * with AccuracyLayer options: + * - top_k (\b optional, default 1). 
+ * Sets the maximum rank @f$ k @f$ at which a prediction is considered + * correct. For example, if @f$ k = 5 @f$, a prediction is counted + * correct if the correct label is among the top 5 predicted labels. + */ + explicit AccuracyLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Accuracy"; + } + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted + * label @f$ \hat{l}_n @f$ given by its maximal index: + * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels @f$ l @f$, an integer-valued Blob with values + * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ + * indicating the correct class label among the @f$ K @f$ classes + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * the computed accuracy: @f$ + * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} + * @f$, where @f$ + * \delta\{\mathrm{condition}\} = \left\{ + * \begin{array}{lr} + * 1 & \mbox{if condition} \\ * 0 & \mbox{otherwise} - * \end{array} \right. - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - - /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. 
- virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < propagate_down.size(); ++i) { - if (propagate_down[i]) { NOT_IMPLEMENTED; } - } - } - - int label_axis_, outer_num_, inner_num_; - - int top_k_; - - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < propagate_down.size(); ++i) { + if (propagate_down[i]) { + NOT_IMPLEMENTED; + } + } + } + + int label_axis_, outer_num_, inner_num_; + + int top_k_; + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; }; /** @@ -96,33 +104,40 @@ class AccuracyLayer : public Layer { * LossLayers are typically only capable of backpropagating to their first input * -- the predictions. */ -template -class LossLayer : public Layer { - public: - explicit LossLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp( - const vector*>& bottom, const vector*>& top); - virtual void Reshape( - const vector*>& bottom, const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 2; } - - /** - * @brief For convenience and backwards compatibility, instruct the Net to - * automatically allocate a single top Blob for LossLayers, into which - * they output their singleton loss, (even if the user didn't specify - * one in the prototxt, etc.). 
- */ - virtual inline bool AutoTopBlobs() const { return true; } - virtual inline int ExactNumTopBlobs() const { return 1; } - /** - * We usually cannot backpropagate to the labels; ignore force_backward for - * these inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 1; - } +template +class LossLayer: public Layer { + public: + explicit LossLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp( + const vector*>& bottom, const vector*>& top); + virtual void Reshape( + const vector*>& bottom, const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + + /** + * @brief For convenience and backwards compatibility, instruct the Net to + * automatically allocate a single top Blob for LossLayers, into which + * they output their singleton loss, (even if the user didn't specify + * one in the prototxt, etc.). + */ + virtual inline bool AutoTopBlobs() const { + return true; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + /** + * We usually cannot backpropagate to the labels; ignore force_backward for + * these inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 1; + } }; /** @@ -149,65 +164,70 @@ class LossLayer : public Layer { * d = \left| \left| a_n - b_n \right| \right|_2^2 @f$. * This can be used to train siamese networks. */ -template -class ContrastiveLossLayer : public LossLayer { - public: - explicit ContrastiveLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 3; } - virtual inline const char* type() const { return "ContrastiveLoss"; } - /** - * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate - * to the first two inputs. 
- */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 2; - } - - protected: - /// @copydoc ContrastiveLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Contrastive error gradient w.r.t. the inputs. - * - * Computes the gradients with respect to the two input vectors (bottom[0] and - * bottom[1]), but not the similarity label (bottom[2]). - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$a@f$; Backward fills their diff with - * gradients if propagate_down[0] - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$b@f$; Backward fills their diff with gradients if - * propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; // cached for backward pass - Blob dist_sq_; // cached for backward pass - Blob diff_sq_; // tmp storage for gpu forward pass - Blob summer_vec_; // tmp storage for gpu forward pass +template +class ContrastiveLossLayer: public LossLayer { + public: + explicit ContrastiveLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 3; + } + virtual inline const char* type() const { + return "ContrastiveLoss"; + } + /** + * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate + * to the first two inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 2; + } + + protected: + /// @copydoc ContrastiveLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Contrastive error gradient w.r.t. the inputs. + * + * Computes the gradients with respect to the two input vectors (bottom[0] and + * bottom[1]), but not the similarity label (bottom[2]). 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$a@f$; Backward fills their diff with + * gradients if propagate_down[0] + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$b@f$; Backward fills their diff with gradients if + * propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; // cached for backward pass + Blob dist_sq_; // cached for backward pass + Blob diff_sq_; // tmp storage for gpu forward pass + Blob summer_vec_; // tmp storage for gpu forward pass }; /** @@ -236,69 +256,72 @@ class ContrastiveLossLayer : public LossLayer { * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve * linear least squares problems! We use it only as an instructive example.) 
*/ -template -class EuclideanLossLayer : public LossLayer { - public: - explicit EuclideanLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "EuclideanLoss"; } - /** - * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate - * to both inputs -- override to return true and always allow force_backward. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } - - protected: - /// @copydoc EuclideanLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Euclidean error gradient w.r.t. the inputs. - * - * Unlike other children of LossLayer, EuclideanLossLayer \b can compute - * gradients with respect to the label inputs bottom[1] (but still only will - * if propagate_down[1] is set, due to being produced by learnable parameters - * or if force_backward is set). In fact, this layer is "commutative" -- the - * result is the same regardless of the order of the two bottoms. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$\hat{y}@f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial \hat{y}} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) - * @f$ if propagate_down[0] - * -# @f$ (N \times C \times H \times W) @f$ - * the targets @f$y@f$; Backward fills their diff with gradients - * @f$ \frac{\partial E}{\partial y} = - * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) - * @f$ if propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; +template +class EuclideanLossLayer: public LossLayer { + public: + explicit EuclideanLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "EuclideanLoss"; + } + /** + * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + /// @copydoc EuclideanLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Euclidean error gradient w.r.t. the inputs. + * + * Unlike other children of LossLayer, EuclideanLossLayer \b can compute + * gradients with respect to the label inputs bottom[1] (but still only will + * if propagate_down[1] is set, due to being produced by learnable parameters + * or if force_backward is set). In fact, this layer is "commutative" -- the + * result is the same regardless of the order of the two bottoms. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$\hat{y}@f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial \hat{y}} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) + * @f$ if propagate_down[0] + * -# @f$ (N \times C \times H \times W) @f$ + * the targets @f$y@f$; Backward fills their diff with gradients + * @f$ \frac{\partial E}{\partial y} = + * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) + * @f$ if propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; }; /** @@ -344,48 +367,51 @@ class EuclideanLossLayer : public LossLayer { * outside the InnerProductLayer and no other losses outside the * HingeLossLayer). */ -template -class HingeLossLayer : public LossLayer { - public: - explicit HingeLossLayer(const LayerParameter& param) - : LossLayer(param) {} - - virtual inline const char* type() const { return "HingeLoss"; } - - protected: - /// @copydoc HingeLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the hinge loss error gradient w.r.t. the predictions. 
- * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$t@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial t} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class HingeLossLayer: public LossLayer { + public: + explicit HingeLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + + virtual inline const char* type() const { + return "HingeLoss"; + } + + protected: + /// @copydoc HingeLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the hinge loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$t@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial t} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -420,66 +446,75 @@ class HingeLossLayer : public LossLayer { * \log(\hat{p}_{n,k}) * @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$. */ -template -class InfogainLossLayer : public LossLayer { - public: - explicit InfogainLossLayer(const LayerParameter& param) - : LossLayer(param), infogain_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should - // be the infogain matrix. (Otherwise the infogain matrix is loaded from a - // file specified by LayerParameter.) 
- virtual inline int ExactNumBottomBlobs() const { return -1; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MaxBottomBlobs() const { return 3; } - - virtual inline const char* type() const { return "InfogainLoss"; } - - protected: - /// @copydoc InfogainLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the infogain loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. (The same applies to the infogain matrix, if - * provided as bottom[2] rather than in the layer_param.) - * - * @param top output Blob vector (length 1), providing the error gradient - * with respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. 
- * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels (similarly for propagate_down[2] and the - * infogain matrix, if provided as bottom[2]) - * @param bottom input Blob vector (length 2-3) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - * -# @f$ (1 \times 1 \times K \times K) @f$ - * (\b optional) the information gain matrix -- ignored as its error - * gradient computation is not implemented. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob infogain_; +template +class InfogainLossLayer: public LossLayer { + public: + explicit InfogainLossLayer(const LayerParameter& param) + : LossLayer(param), infogain_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should + // be the infogain matrix. (Otherwise the infogain matrix is loaded from a + // file specified by LayerParameter.) + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MaxBottomBlobs() const { + return 3; + } + + virtual inline const char* type() const { + return "InfogainLoss"; + } + + protected: + /// @copydoc InfogainLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the infogain loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
(The same applies to the infogain matrix, if + * provided as bottom[2] rather than in the layer_param.) + * + * @param top output Blob vector (length 1), providing the error gradient + * with respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels (similarly for propagate_down[2] and the + * infogain matrix, if provided as bottom[2]) + * @param bottom input Blob vector (length 2-3) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + * -# @f$ (1 \times 1 \times K \times K) @f$ + * (\b optional) the information gain matrix -- ignored as its error + * gradient computation is not implemented. 
+ */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob infogain_; }; /** @@ -511,51 +546,54 @@ class InfogainLossLayer : public LossLayer { * \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n}) * @f$ */ -template -class MultinomialLogisticLossLayer : public LossLayer { - public: - explicit MultinomialLogisticLossLayer(const LayerParameter& param) - : LossLayer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MultinomialLogisticLoss"; } - - protected: - /// @copydoc MultinomialLogisticLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the multinomial logistic loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class MultinomialLogisticLossLayer: public LossLayer { + public: + explicit MultinomialLogisticLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MultinomialLogisticLoss"; + } + + protected: + /// @copydoc MultinomialLogisticLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the multinomial logistic loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -587,72 +625,75 @@ class MultinomialLogisticLossLayer : public LossLayer { * \right] * @f$ */ -template -class SigmoidCrossEntropyLossLayer : public LossLayer { - public: - explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) - : LossLayer(param), - sigmoid_layer_(new SigmoidLayer(param)), - sigmoid_output_(new Blob()) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SigmoidCrossEntropyLoss"; } - - protected: - /// @copydoc SigmoidCrossEntropyLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the target inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) 
- * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as gradient computation with respect - * to the targets is not implemented. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$x@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) - * @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// The internal SigmoidLayer used to map predictions to probabilities. - shared_ptr > sigmoid_layer_; - /// sigmoid_output stores the output of the SigmoidLayer. - shared_ptr > sigmoid_output_; - /// bottom vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_bottom_vec_; - /// top vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_top_vec_; +template +class SigmoidCrossEntropyLossLayer: public LossLayer { + public: + explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) + : LossLayer(param), + sigmoid_layer_(new SigmoidLayer(param)), + sigmoid_output_(new Blob()) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SigmoidCrossEntropyLoss"; + } + + protected: + /// @copydoc SigmoidCrossEntropyLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the + * predictions. 
+ * + * Gradients cannot be computed with respect to the target inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as gradient computation with respect + * to the targets is not implemented. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$x@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) + * @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// The internal SigmoidLayer used to map predictions to probabilities. + shared_ptr > sigmoid_layer_; + /// sigmoid_output stores the output of the SigmoidLayer. + shared_ptr > sigmoid_output_; + /// bottom vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_bottom_vec_; + /// top vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_top_vec_; }; // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer. 
-template class SoftmaxLayer; +template class SoftmaxLayer; /** * @brief Computes the multinomial logistic loss for a one-of-many @@ -668,7 +709,7 @@ template class SoftmaxLayer; * -# @f$ (N \times C \times H \times W) @f$ * the predictions @f$ x @f$, a Blob with values in * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of -ss + ss * the @f$ K = CHW @f$ classes. This layer maps these scores to a * probability distribution over classes using the softmax function * @f$ \hat{p}_{nk} = \exp(x_{nk}) / @@ -683,92 +724,101 @@ ss * \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n}) * @f$, for softmax output class probabilites @f$ \hat{p} @f$ */ -template -class SoftmaxWithLossLayer : public LossLayer { - public: - /** - * @param param provides LossParameter loss_param, with options: - * - ignore_label (optional) - * Specify a label value that should be ignored when computing the loss. - * - normalize (optional, default true) - * If true, the loss is normalized by the number of (nonignored) labels - * present; otherwise the loss is simply summed over spatial locations. - */ - explicit SoftmaxWithLossLayer(const LayerParameter& param) - : LossLayer(param) {} - ~SoftmaxWithLossLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SoftmaxWithLoss"; } - virtual inline int ExactNumTopBlobs() const { return -1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - /// @copydoc SoftmaxWithLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /** - * @brief Computes the softmax loss error gradient w.r.t. the predictions. 
- * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - void ocl_setup(); - - /// The internal SoftmaxLayer used to map predictions to a distribution. - shared_ptr > softmax_layer_; - /// prob stores the output probability predictions from the SoftmaxLayer. - Blob prob_; - /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_bottom_vec_; - /// top vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_top_vec_; - /// Whether to ignore instances with a certain label. 
- bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; - /// Whether to normalize the loss by the total number of values present - /// (otherwise just by the batch size). - bool normalize_; - - int softmax_axis_, outer_num_, inner_num_; - - protected: - cl_kernel diff_kernel, scal_kernel, softmax_kernel; - cl_mem d_loss; - cl_kernel softmax_loss_fp_kernel; - cl_kernel softmax_loss_bp_kernel; +template +class SoftmaxWithLossLayer: public LossLayer { + public: + /** + * @param param provides LossParameter loss_param, with options: + * - ignore_label (optional) + * Specify a label value that should be ignored when computing the loss. + * - normalize (optional, default true) + * If true, the loss is normalized by the number of (nonignored) labels + * present; otherwise the loss is simply summed over spatial locations. + */ + explicit SoftmaxWithLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + ~SoftmaxWithLossLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SoftmaxWithLoss"; + } + virtual inline int ExactNumTopBlobs() const { + return -1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + /// @copydoc SoftmaxWithLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /** + * @brief Computes the softmax loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + void ocl_setup(); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// Whether to normalize the loss by the total number of values present + /// (otherwise just by the batch size). 
+ bool normalize_; + + int softmax_axis_, outer_num_, inner_num_; + + protected: + cl_kernel diff_kernel, scal_kernel, softmax_kernel; + cl_mem d_loss; + cl_kernel softmax_loss_fp_kernel; + cl_kernel softmax_loss_bp_kernel; }; } // namespace caffe diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 5665df1e..68e631a1 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -20,249 +20,268 @@ namespace caffe { * * TODO(dox): more thorough description. */ -template +template class Net { - public: - explicit Net(const NetParameter& param); - explicit Net(const string& param_file, Phase phase); - virtual ~Net() {} + public: + explicit Net(const NetParameter& param); + explicit Net(const string& param_file, Phase phase); + virtual ~Net() { + } - /// @brief Initialize a network with a NetParameter. - void Init(const NetParameter& param); + /// @brief Initialize a network with a NetParameter. + void Init(const NetParameter& param); - /** - * @brief Run Forward with the input Blob%s already fed separately. - * - * You can get the input blobs using input_blobs(). - */ - const vector*>& ForwardPrefilled(Dtype* loss = NULL); + /** + * @brief Run Forward with the input Blob%s already fed separately. + * + * You can get the input blobs using input_blobs(). + */ + const vector*>& ForwardPrefilled(Dtype* loss = NULL); - /** - * The From and To variants of Forward and Backward operate on the - * (topological) ordering by which the net is specified. For general DAG - * networks, note that (1) computing from one layer to another might entail - * extra computation on unrelated branches, and (2) computation starting in - * the middle may be incorrect if all of the layers of a fan-in are not - * included. - */ - Dtype ForwardFromTo(int start, int end); - Dtype ForwardFrom(int start); - Dtype ForwardTo(int end); - /// @brief Run forward using a set of bottom blobs, and return the result. 
- const vector*>& Forward(const vector* > & bottom, - Dtype* loss = NULL); - /** - * @brief Run forward using a serialized BlobProtoVector and return the - * result as a serialized BlobProtoVector - */ - string Forward(const string& input_blob_protos, Dtype* loss = NULL); + /** + * The From and To variants of Forward and Backward operate on the + * (topological) ordering by which the net is specified. For general DAG + * networks, note that (1) computing from one layer to another might entail + * extra computation on unrelated branches, and (2) computation starting in + * the middle may be incorrect if all of the layers of a fan-in are not + * included. + */ + Dtype ForwardFromTo(int start, int end); + Dtype ForwardFrom(int start); + Dtype ForwardTo(int end); + /// @brief Run forward using a set of bottom blobs, and return the result. + const vector*>& Forward(const vector*> & bottom, + Dtype* loss = NULL); + /** + * @brief Run forward using a serialized BlobProtoVector and return the + * result as a serialized BlobProtoVector + */ + string Forward(const string& input_blob_protos, Dtype* loss = NULL); - /** - * The network backward should take no input and output, since it solely - * computes the gradient w.r.t the parameters, and the data has already been - * provided during the forward pass. - */ - void Backward(); - void BackwardFromTo(int start, int end); - void BackwardFrom(int start); - void BackwardTo(int end); + /** + * The network backward should take no input and output, since it solely + * computes the gradient w.r.t the parameters, and the data has already been + * provided during the forward pass. + */ + void Backward(); + void BackwardFromTo(int start, int end); + void BackwardFrom(int start); + void BackwardTo(int end); - /** - * @brief Reshape all layers from bottom to top. - * - * This is useful to propagate changes to layer sizes without running - * a forward pass, e.g. to compute output feature size. 
- */ - void Reshape(); + /** + * @brief Reshape all layers from bottom to top. + * + * This is useful to propagate changes to layer sizes without running + * a forward pass, e.g. to compute output feature size. + */ + void Reshape(); - Dtype ForwardBackward(const vector* > & bottom) { - Dtype loss; - Forward(bottom, &loss); - Backward(); - return loss; - } + Dtype ForwardBackward(const vector*> & bottom) { + Dtype loss; + Forward(bottom, &loss); + Backward(); + return loss; + } - /// @brief Updates the network weights based on the diff values computed. - void Update(); + /// @brief Updates the network weights based on the diff values computed. + void Update(); - /** - * @brief For an already initialized net, implicitly copies (i.e., using no - * additional memory) the pre-trained layers from another Net. - */ - void ShareTrainedLayersWith(const Net* other); - // For an already initialized net, CopyTrainedLayersFrom() copies the already - // trained layers from another net parameter instance. - /** - * @brief For an already initialized net, copies the pre-trained layers from - * another Net. - */ - void CopyTrainedLayersFrom(const NetParameter& param); - void CopyTrainedLayersFrom(const string trained_filename); - /// @brief Writes the net to a proto. - void ToProto(NetParameter* param, bool write_diff = false) const; + /** + * @brief For an already initialized net, implicitly copies (i.e., using no + * additional memory) the pre-trained layers from another Net. + */ + void ShareTrainedLayersWith(const Net* other); + // For an already initialized net, CopyTrainedLayersFrom() copies the already + // trained layers from another net parameter instance. + /** + * @brief For an already initialized net, copies the pre-trained layers from + * another Net. + */ + void CopyTrainedLayersFrom(const NetParameter& param); + void CopyTrainedLayersFrom(const string trained_filename); + /// @brief Writes the net to a proto. 
+ void ToProto(NetParameter* param, bool write_diff = false) const; - /// @brief returns the network name. - inline const string& name() const { return name_; } - /// @brief returns the layer names - inline const vector& layer_names() const { return layer_names_; } - /// @brief returns the blob names - inline const vector& blob_names() const { return blob_names_; } - /// @brief returns the blobs - inline const vector > >& blobs() const { - return blobs_; - } - /// @brief returns the layers - inline const vector > >& layers() const { - return layers_; - } - /// @brief returns the phase: TRAIN or TEST - inline Phase phase() const { return phase_; } - /** - * @brief returns the bottom vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. - */ - inline const vector*> >& bottom_vecs() const { - return bottom_vecs_; - } - /** - * @brief returns the top vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. 
- */ - inline const vector*> >& top_vecs() const { - return top_vecs_; - } - inline const vector >& bottom_need_backward() const { - return bottom_need_backward_; - } - inline const vector& blob_loss_weights() const { - return blob_loss_weights_; - } - inline const vector& layer_need_backward() const { - return layer_need_backward_; - } - /// @brief returns the parameters - inline const vector > >& params() const { - return params_; - } - /// @brief returns the parameter learning rate multipliers - inline const vector& params_lr() const { return params_lr_; } - inline const vector& params_weight_decay() const { - return params_weight_decay_; - } - const map& param_names_index() const { - return param_names_index_; - } - inline const vector& param_owners() const { return param_owners_; } - /// @brief Input and output blob numbers - inline int num_inputs() const { return net_input_blobs_.size(); } - inline int num_outputs() const { return net_output_blobs_.size(); } - inline const vector*>& input_blobs() const { - return net_input_blobs_; - } - inline const vector*>& output_blobs() const { - return net_output_blobs_; - } - inline const vector& input_blob_indices() const { - return net_input_blob_indices_; - } - inline const vector& output_blob_indices() const { - return net_output_blob_indices_; - } - bool has_blob(const string& blob_name) const; - const shared_ptr > blob_by_name(const string& blob_name) const; - bool has_layer(const string& layer_name) const; - const shared_ptr > layer_by_name(const string& layer_name) const; + /// @brief returns the network name. 
+ inline const string& name() const { + return name_; + } + /// @brief returns the layer names + inline const vector& layer_names() const { + return layer_names_; + } + /// @brief returns the blob names + inline const vector& blob_names() const { + return blob_names_; + } + /// @brief returns the blobs + inline const vector > >& blobs() const { + return blobs_; + } + /// @brief returns the layers + inline const vector > >& layers() const { + return layers_; + } + /// @brief returns the phase: TRAIN or TEST + inline Phase phase() const { + return phase_; + } + /** + * @brief returns the bottom vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& bottom_vecs() const { + return bottom_vecs_; + } + /** + * @brief returns the top vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& top_vecs() const { + return top_vecs_; + } + inline const vector >& bottom_need_backward() const { + return bottom_need_backward_; + } + inline const vector& blob_loss_weights() const { + return blob_loss_weights_; + } + inline const vector& layer_need_backward() const { + return layer_need_backward_; + } + /// @brief returns the parameters + inline const vector > >& params() const { + return params_; + } + /// @brief returns the parameter learning rate multipliers + inline const vector& params_lr() const { + return params_lr_; + } + inline const vector& params_weight_decay() const { + return params_weight_decay_; + } + const map& param_names_index() const { + return param_names_index_; + } + inline const vector& param_owners() const { + return param_owners_; + } + /// @brief Input and output blob numbers + inline int num_inputs() const { + return net_input_blobs_.size(); + } + inline int num_outputs() const { + return net_output_blobs_.size(); + } + inline const vector*>& input_blobs() const { + return net_input_blobs_; + } + 
inline const vector*>& output_blobs() const { + return net_output_blobs_; + } + inline const vector& input_blob_indices() const { + return net_input_blob_indices_; + } + inline const vector& output_blob_indices() const { + return net_output_blob_indices_; + } + bool has_blob(const string& blob_name) const; + const shared_ptr > blob_by_name(const string& blob_name) const; + bool has_layer(const string& layer_name) const; + const shared_ptr > layer_by_name( + const string& layer_name) const; - void set_debug_info(const bool value) { debug_info_ = value; } + void set_debug_info(const bool value) { + debug_info_ = value; + } - // Helpers for Init. - /** - * @brief Remove layers that the user specified should be excluded given the current - * phase, level, and stage. - */ - static void FilterNet(const NetParameter& param, - NetParameter* param_filtered); - /// @brief return whether NetState state meets NetStateRule rule - static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, - const string& layer_name); + // Helpers for Init. + /** + * @brief Remove layers that the user specified should be excluded given the current + * phase, level, and stage. + */ + static void FilterNet(const NetParameter& param, + NetParameter* param_filtered); + /// @brief return whether NetState state meets NetStateRule rule + static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name); - protected: - // Helpers for Init. - /// @brief Append a new input or top blob to the net. - void AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new bottom blob to the net. - int AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new parameter blob to the net. 
- void AppendParam(const NetParameter& param, const int layer_id, - const int param_id); + protected: + // Helpers for Init. + /// @brief Append a new input or top blob to the net. + void AppendTop(const NetParameter& param, const int layer_id, + const int top_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new bottom blob to the net. + int AppendBottom(const NetParameter& param, const int layer_id, + const int bottom_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new parameter blob to the net. + void AppendParam(const NetParameter& param, const int layer_id, + const int param_id); - /// @brief Helper for displaying debug info in Forward about input Blobs. - void InputDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Forward. - void ForwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Backward. - void BackwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Update. - void UpdateDebugInfo(const int param_id); + /// @brief Helper for displaying debug info in Forward about input Blobs. + void InputDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Forward. + void ForwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Backward. + void BackwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Update. + void UpdateDebugInfo(const int param_id); - /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. - void GetLearningRateAndWeightDecay(); + /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. 
+ void GetLearningRateAndWeightDecay(); - /// @brief The network name - string name_; - /// @brief The phase: TRAIN or TEST - Phase phase_; - /// @brief Individual layers in the net - vector > > layers_; - vector layer_names_; - map layer_names_index_; - vector layer_need_backward_; - /// @brief the blobs storing intermediate results between the layer. - vector > > blobs_; - vector blob_names_; - map blob_names_index_; - vector blob_need_backward_; - /// bottom_vecs stores the vectors containing the input for each layer. - /// They don't actually host the blobs (blobs_ does), so we simply store - /// pointers. - vector*> > bottom_vecs_; - vector > bottom_id_vecs_; - vector > bottom_need_backward_; - /// top_vecs stores the vectors containing the output for each layer - vector*> > top_vecs_; - vector > top_id_vecs_; - /// Vector of weight in the loss (or objective) function of each net blob, - /// indexed by blob_id. - vector blob_loss_weights_; - vector > param_id_vecs_; - vector param_owners_; - vector param_display_names_; - vector > param_layer_indices_; - map param_names_index_; - /// blob indices for the input and the output of the net - vector net_input_blob_indices_; - vector net_output_blob_indices_; - vector*> net_input_blobs_; - vector*> net_output_blobs_; - /// The parameters in the network. - vector > > params_; - /// the learning rate multipliers - vector params_lr_; - /// the weight decay multipliers - vector params_weight_decay_; - /// The bytes of memory used by this net - size_t memory_used_; - /// Whether to compute and display debug info for the net. - bool debug_info_; + /// @brief The network name + string name_; + /// @brief The phase: TRAIN or TEST + Phase phase_; + /// @brief Individual layers in the net + vector > > layers_; + vector layer_names_; + map layer_names_index_; + vector layer_need_backward_; + /// @brief the blobs storing intermediate results between the layer. 
+ vector > > blobs_; + vector blob_names_; + map blob_names_index_; + vector blob_need_backward_; + /// bottom_vecs stores the vectors containing the input for each layer. + /// They don't actually host the blobs (blobs_ does), so we simply store + /// pointers. + vector*> > bottom_vecs_; + vector > bottom_id_vecs_; + vector > bottom_need_backward_; + /// top_vecs stores the vectors containing the output for each layer + vector*> > top_vecs_; + vector > top_id_vecs_; + /// Vector of weight in the loss (or objective) function of each net blob, + /// indexed by blob_id. + vector blob_loss_weights_; + vector > param_id_vecs_; + vector param_owners_; + vector param_display_names_; + vector > param_layer_indices_; + map param_names_index_; + /// blob indices for the input and the output of the net + vector net_input_blob_indices_; + vector net_output_blob_indices_; + vector*> net_input_blobs_; + vector*> net_output_blobs_; + /// The parameters in the network. + vector > > params_; + /// the learning rate multipliers + vector params_lr_; + /// the weight decay multipliers + vector params_weight_decay_; + /// The bytes of memory used by this net + size_t memory_used_; + /// Whether to compute and display debug info for the net. + bool debug_info_; - DISABLE_COPY_AND_ASSIGN(Net); + DISABLE_COPY_AND_ASSIGN (Net); }; - } // namespace caffe #endif // CAFFE_NET_HPP_ diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index cf6d645a..5606ff65 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -22,16 +22,21 @@ namespace caffe { * each element of the output depends only on the corresponding input * element. 
*/ -template -class NeuronLayer : public Layer { - public: - explicit NeuronLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } +template +class NeuronLayer: public Layer { + public: + explicit NeuronLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } }; /** @@ -44,46 +49,53 @@ class NeuronLayer : public Layer { * -# @f$ (N \times C \times H \times W) @f$ * the computed outputs @f$ y = |x| @f$ */ -template -class AbsValLayer : public NeuronLayer { - public: - explicit AbsValLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "AbsVal"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /// @copydoc AbsValLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the absolute value inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \mathrm{sign}(x) \frac{\partial E}{\partial y} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class AbsValLayer: public NeuronLayer { + public: + explicit AbsValLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "AbsVal"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /// @copydoc AbsValLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the absolute value inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \mathrm{sign}(x) \frac{\partial E}{\partial y} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -103,41 +115,44 @@ class AbsValLayer : public NeuronLayer { * \end{array} \right. * @f$ */ -template -class BNLLLayer : public NeuronLayer { - public: - explicit BNLLLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "BNLL"; } - - protected: - /// @copydoc BNLLLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the BNLL inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class BNLLLayer: public NeuronLayer { + public: + explicit BNLLLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "BNLL"; + } + + protected: + /// @copydoc BNLLLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the BNLL inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -151,64 +166,66 @@ class BNLLLayer : public NeuronLayer { * -# @f$ (N \times C \times H \times W) @f$ * the computed outputs @f$ y = |x| @f$ */ -template -class DropoutLayer : public NeuronLayer { - public: - /** - * @param param provides DropoutParameter dropout_param, - * with DropoutLayer options: - * - dropout_ratio (\b optional, default 0.5). - * Sets the probability @f$ p @f$ that any given unit is dropped. - */ - explicit DropoutLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Dropout"; } - virtual ~DropoutLayer(); - void ocl_setup(int bottom_count); - cl_mem MaskMem; - cl_kernel ocl_Kernel_Fwd; - cl_kernel ocl_Kernel_Bwd; - cl_kernel rng_kernel; - - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs. At training time, we have @f$ - * y_{\mbox{train}} = \left\{ - * \begin{array}{ll} - * \frac{x}{1 - p} & \mbox{if } u > p \\ +template +class DropoutLayer: public NeuronLayer { + public: + /** + * @param param provides DropoutParameter dropout_param, + * with DropoutLayer options: + * - dropout_ratio (\b optional, default 0.5). + * Sets the probability @f$ p @f$ that any given unit is dropped. 
+ */ + explicit DropoutLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Dropout"; + } + virtual ~DropoutLayer(); + void ocl_setup(int bottom_count); + cl_mem MaskMem; + cl_kernel ocl_Kernel_Fwd; + cl_kernel ocl_Kernel_Bwd; + cl_kernel rng_kernel; + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs. At training time, we have @f$ + * y_{\mbox{train}} = \left\{ + * \begin{array}{ll} + * \frac{x}{1 - p} & \mbox{if } u > p \\ * 0 & \mbox{otherwise} - * \end{array} \right. - * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each - * input at each iteration. At test time, we simply have - * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ - Blob rand_vec_; - /// the probability @f$ p @f$ of dropping any input - Dtype threshold_; - /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ - Dtype scale_; - unsigned int uint_thres_; + * \end{array} \right. + * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each + * input at each iteration. At test time, we simply have + * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. 
+ */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ + Blob rand_vec_; + /// the probability @f$ p @f$ of dropping any input + Dtype threshold_; + /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ + Dtype scale_; + unsigned int uint_thres_; }; /** @@ -216,63 +233,66 @@ class DropoutLayer : public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and base @f$ \gamma @f$. */ -template -class ExpLayer : public NeuronLayer { - public: - /** - * @param param provides ExpParameter exp_param, - * with ExpLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit ExpLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Exp"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \gamma ^ {\alpha x + \beta} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. 
- * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype inner_scale_, outer_scale_; +template +class ExpLayer: public NeuronLayer { + public: + /** + * @param param provides ExpParameter exp_param, + * with ExpLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit ExpLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Exp"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \gamma ^ {\alpha x + \beta} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype inner_scale_, outer_scale_; }; /** @@ -280,65 +300,68 @@ class ExpLayer : public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and base @f$ \gamma @f$. 
*/ -template -class LogLayer : public NeuronLayer { - public: - /** - * @param param provides LogParameter log_param, - * with LogLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit LogLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Log"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = log_{\gamma}(\alpha x + \beta) - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype base_scale_; - Dtype input_scale_, input_shift_; - Dtype backward_num_scale_; +template +class LogLayer: public NeuronLayer { + public: + /** + * @param param provides LogParameter log_param, + * with LogLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit LogLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Log"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = log_{\gamma}(\alpha x + \beta) + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype base_scale_; + Dtype input_scale_, input_shift_; + Dtype backward_num_scale_; }; /** @@ -346,141 +369,146 @@ class LogLayer : public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and power @f$ \gamma @f$. 
*/ -template -class PowerLayer : public NeuronLayer { - public: - /** - * @param param provides PowerParameter power_param, - * with PowerLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - power (\b optional, default 1) the power @f$ \gamma @f$ - */ - explicit PowerLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Power"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (\alpha x + \beta) ^ \gamma - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the power inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} - * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = - * \frac{\partial E}{\partial y} - * \frac{\alpha \gamma y}{\alpha x + \beta} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief @f$ \gamma @f$ from layer_param_.power_param() - Dtype power_; - /// @brief @f$ \alpha @f$ from layer_param_.power_param() - Dtype scale_; - /// @brief @f$ \beta @f$ from layer_param_.power_param() - Dtype shift_; - /// @brief Result of @f$ \alpha \gamma @f$ - Dtype diff_scale_; +template +class PowerLayer: public NeuronLayer { + public: + /** + * @param param provides PowerParameter power_param, + * with PowerLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - power (\b optional, default 1) the power @f$ \gamma @f$ + */ + explicit PowerLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Power"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (\alpha x + \beta) ^ \gamma + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. 
the power inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} + * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = + * \frac{\partial E}{\partial y} + * \frac{\alpha \gamma y}{\alpha x + \beta} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief @f$ \gamma @f$ from layer_param_.power_param() + Dtype power_; + /// @brief @f$ \alpha @f$ from layer_param_.power_param() + Dtype scale_; + /// @brief @f$ \beta @f$ from layer_param_.power_param() + Dtype shift_; + /// @brief Result of @f$ \alpha \gamma @f$ + Dtype diff_scale_; }; /** * @brief Rectified Linear Unit non-linearity @f$ y = \max(0, x) @f$. * The simple max is fast to compute, and the function does not saturate. */ -template -class ReLULayer : public NeuronLayer { - public: - /** - * @param param provides ReLUParameter relu_param, - * with ReLULayer options: - * - negative_slope (\b optional, default 0). - * the value @f$ \nu @f$ by which negative values are multiplied. 
- */ - explicit ReLULayer(const LayerParameter& param) - : NeuronLayer(param) { - } - virtual inline const char* type() const { return "ReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \max(0, x) - * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the ReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le 0 \\ +template +class ReLULayer: public NeuronLayer { + public: + /** + * @param param provides ReLUParameter relu_param, + * with ReLULayer options: + * - negative_slope (\b optional, default 0). + * the value @f$ \nu @f$ by which negative values are multiplied. 
+ */ + explicit ReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual inline const char* type() const { + return "ReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \max(0, x) + * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the ReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$ if propagate_down[0], by default. - * If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed gradients are @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ + * \end{array} \right. + * @f$ if propagate_down[0], by default. 
+ * If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed gradients are @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + * \end{array} \right. + * @f$. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -489,25 +517,25 @@ class ReLULayer : public NeuronLayer { */ template class CuDNNReLULayer : public ReLULayer { - public: - explicit CuDNNReLULayer(const LayerParameter& param) - : ReLULayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNReLULayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNReLULayer(const LayerParameter& param) + : ReLULayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNReLULayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& 
bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -519,51 +547,54 @@ class CuDNNReLULayer : public ReLULayer { * Note that the gradient vanishes as the values move away from 0. * The ReLULayer is often a better choice for this reason. */ -template -class SigmoidLayer : public NeuronLayer { - public: - explicit SigmoidLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "Sigmoid"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (1 + \exp(-x))^{-1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} y (1 - y) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class SigmoidLayer: public NeuronLayer { + public: + explicit SigmoidLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "Sigmoid"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (1 + \exp(-x))^{-1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} y (1 - y) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -572,25 +603,25 @@ class SigmoidLayer : public NeuronLayer { */ template class CuDNNSigmoidLayer : public SigmoidLayer { - public: - explicit CuDNNSigmoidLayer(const LayerParameter& param) - : SigmoidLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSigmoidLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNSigmoidLayer(const LayerParameter& param) + : SigmoidLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNSigmoidLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -602,53 +633,56 @@ class CuDNNSigmoidLayer : public SigmoidLayer { * Note that the gradient vanishes as the 
values move away from 0. * The ReLULayer is often a better choice for this reason. */ -template -class TanHLayer : public NeuronLayer { - public: - explicit TanHLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "TanH"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} - * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) - * = \frac{\partial E}{\partial y} (1 - y^2) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class TanHLayer: public NeuronLayer { + public: + explicit TanHLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "TanH"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} + * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) + * = \frac{\partial E}{\partial y} (1 - y^2) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -657,25 +691,25 @@ class TanHLayer : public NeuronLayer { */ template class CuDNNTanHLayer : public TanHLayer { - public: - explicit CuDNNTanHLayer(const LayerParameter& param) - : TanHLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNTanHLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNTanHLayer(const LayerParameter& param) + : TanHLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNTanHLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -683,48 +717,51 @@ class 
CuDNNTanHLayer : public TanHLayer { * @brief Tests whether the input exceeds a threshold: outputs 1 for inputs * above threshold; 0 otherwise. */ -template -class ThresholdLayer : public NeuronLayer { - public: - /** - * @param param provides ThresholdParameter threshold_param, - * with ThresholdLayer options: - * - threshold (\b optional, default 0). - * the threshold value @f$ t @f$ to which the input values are compared. - */ - explicit ThresholdLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Threshold"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le t \\ +template +class ThresholdLayer: public NeuronLayer { + public: + /** + * @param param provides ThresholdParameter threshold_param, + * with ThresholdLayer options: + * - threshold (\b optional, default 0). + * the threshold value @f$ t @f$ to which the input values are compared. + */ + explicit ThresholdLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Threshold"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le t \\ * 1 & \mathrm{if} \; x > t - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - - Dtype threshold_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + Dtype threshold_; }; /** @@ -735,81 +772,84 @@ class ThresholdLayer : public NeuronLayer { * channels. The number of axes of input blob should be greater than or * equal to 2. The 1st axis (0-based) is seen as channels. */ -template -class PReLULayer : public NeuronLayer { - public: - /** - * @param param provides PReLUParameter prelu_param, - * with PReLULayer options: - * - filler (\b optional, FillerParameter, - * default {'type': constant 'value':0.25}). - * - channel_shared (\b optional, default false). - * negative slopes are shared across channels. - */ - explicit PReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "PReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the computed outputs for each channel @f$i@f$ @f$ - * y_i = \max(0, x_i) + a_i \min(0, x_i) - * @f$. 
- */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the PReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times ...) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their - * diff with gradients @f$ - * \frac{\partial E}{\partial x_i} = \left\{ - * \begin{array}{lr} - * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ +template +class PReLULayer: public NeuronLayer { + public: + /** + * @param param provides PReLUParameter prelu_param, + * with PReLULayer options: + * - filler (\b optional, FillerParameter, + * default {'type': constant 'value':0.25}). + * - channel_shared (\b optional, default false). + * negative slopes are shared across channels. + */ + explicit PReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "PReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the computed outputs for each channel @f$i@f$ @f$ + * y_i = \max(0, x_i) + a_i \min(0, x_i) + * @f$. 
+ */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the PReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times ...) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their + * diff with gradients @f$ + * \frac{\partial E}{\partial x_i} = \left\{ + * \begin{array}{lr} + * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - * If param_propagate_down_[0] is true, it fills the diff with gradients - * @f$ - * \frac{\partial E}{\partial a_i} = \left\{ - * \begin{array}{lr} - * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ + * \end{array} \right. + * @f$. + * If param_propagate_down_[0] is true, it fills the diff with gradients + * @f$ + * \frac{\partial E}{\partial a_i} = \left\{ + * \begin{array}{lr} + * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * 0 & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool channel_shared_; - Blob multiplier_; // dot multiplier for backward computation of params - Blob backward_buff_; // temporary buffer for backward computation - Blob bottom_memory_; // memory for in-place computation + * \end{array} \right. + * @f$. 
+ */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool channel_shared_; + Blob multiplier_; // dot multiplier for backward computation of params + Blob backward_buff_; // temporary buffer for backward computation + Blob bottom_memory_; // memory for in-place computation }; } // namespace caffe diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 19cf18c9..653f5e36 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -10,56 +10,59 @@ namespace bp = boost::python; namespace caffe { -template -class PythonLayer : public Layer { - public: - PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } +template +class PythonLayer: public Layer { + public: + PythonLayer(PyObject* self, const LayerParameter& param) + : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { + } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("setup")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("setup")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("reshape")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("reshape")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - virtual inline const char* type() const { return "Python"; } + virtual inline const char* type() const { + return "Python"; + } - protected: - virtual void 
Forward_cpu(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("forward")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - try { - self_.attr("backward")(top, propagate_down, bottom); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("forward")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + try { + self_.attr("backward")(top, propagate_down, bottom); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - private: - bp::object self_; + private: + bp::object self_; }; } // namespace caffe diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 79285a4a..688fb99f 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -14,151 +14,169 @@ namespace caffe { * Requires implementation of ApplyUpdate to compute a parameter update * given the current state of the Net parameters. */ -template +template class Solver { - public: - explicit Solver(const SolverParameter& param); - explicit Solver(const string& param_file); - void Init(const SolverParameter& param); - void InitTrainNet(); - void InitTestNets(); - // The main entry of the solver function. In default, iter will be zero. Pass - // in a non-zero iter number to resume training for a pre-trained net. - virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { Solve(resume_file.c_str()); } - void Step(int iters); - // The Restore function implements how one should restore the solver to a - // previously snapshotted state. 
You should implement the RestoreSolverState() - // function that restores the state from a SolverState protocol buffer. - void Restore(const char* resume_file); - virtual ~Solver() {} - inline shared_ptr > net() { return net_; } - inline const vector > >& test_nets() { - return test_nets_; - } - int iter() { return iter_; } - - protected: - // Make and apply the update value for the current iteration. - virtual void ApplyUpdate() = 0; - // The Solver::Snapshot function implements the basic snapshotting utility - // that stores the learned net. You should implement the SnapshotSolverState() - // function that produces a SolverState protocol buffer that needs to be - // written to disk together with the learned net. - void Snapshot(); - // The test routine - void TestAll(); - void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(SolverState* state) = 0; - virtual void RestoreSolverState(const SolverState& state) = 0; - - void DisplayOutputBlobs(const int net_id); - - SolverParameter param_; - int iter_; - int current_step_; - shared_ptr > net_; - vector > > test_nets_; - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - - DISABLE_COPY_AND_ASSIGN(Solver); + public: + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); + void Init(const SolverParameter& param); + void InitTrainNet(); + void InitTestNets(); + // The main entry of the solver function. In default, iter will be zero. Pass + // in a non-zero iter number to resume training for a pre-trained net. + virtual void Solve(const char* resume_file = NULL); + inline void Solve(const string resume_file) { + Solve(resume_file.c_str()); + } + void Step(int iters); + // The Restore function implements how one should restore the solver to a + // previously snapshotted state. You should implement the RestoreSolverState() + // function that restores the state from a SolverState protocol buffer. 
+ void Restore(const char* resume_file); + virtual ~Solver() { + } + inline shared_ptr > net() { + return net_; + } + inline const vector > >& test_nets() { + return test_nets_; + } + int iter() { + return iter_; + } + + protected: + // Make and apply the update value for the current iteration. + virtual void ApplyUpdate() = 0; + // The Solver::Snapshot function implements the basic snapshotting utility + // that stores the learned net. You should implement the SnapshotSolverState() + // function that produces a SolverState protocol buffer that needs to be + // written to disk together with the learned net. + void Snapshot(); + // The test routine + void TestAll(); + void Test(const int test_net_id = 0); + virtual void SnapshotSolverState(SolverState* state) = 0; + virtual void RestoreSolverState(const SolverState& state) = 0; + + void DisplayOutputBlobs(const int net_id); + + SolverParameter param_; + int iter_; + int current_step_; + shared_ptr > net_; + vector > > test_nets_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (Solver); }; - /** * @brief Optimizes the parameters of a Net using * stochastic gradient descent (SGD) with momentum. */ -template -class SGDSolver : public Solver { - public: - explicit SGDSolver(const SolverParameter& param) - : Solver(param) { PreSolve(); } - explicit SGDSolver(const string& param_file) - : Solver(param_file) { PreSolve(); } - - const vector > >& history() { return history_; } - - protected: - void PreSolve(); - Dtype GetLearningRate(); - virtual void ApplyUpdate(); - virtual void Normalize(int param_id); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - virtual void ClipGradients(); - virtual void SnapshotSolverState(SolverState * state); - virtual void RestoreSolverState(const SolverState& state); - // history maintains the historical momentum data. 
- // update maintains update related data and is not needed in snapshots. - // temp maintains other information that might be needed in computation - // of gradients/updates and is not needed in snapshots - vector > > history_, update_, temp_; - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - - DISABLE_COPY_AND_ASSIGN(SGDSolver); +template +class SGDSolver: public Solver { + public: + explicit SGDSolver(const SolverParameter& param) + : Solver(param) { + PreSolve(); + } + explicit SGDSolver(const string& param_file) + : Solver(param_file) { + PreSolve(); + } + + const vector > >& history() { + return history_; + } + + protected: + void PreSolve(); + Dtype GetLearningRate(); + virtual void ApplyUpdate(); + virtual void Normalize(int param_id); + virtual void Regularize(int param_id); + virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ClipGradients(); + virtual void SnapshotSolverState(SolverState * state); + virtual void RestoreSolverState(const SolverState& state); + // history maintains the historical momentum data. + // update maintains update related data and is not needed in snapshots. 
+ // temp maintains other information that might be needed in computation + // of gradients/updates and is not needed in snapshots + vector > > history_, update_, temp_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (SGDSolver); }; -template -class NesterovSolver : public SGDSolver { - public: - explicit NesterovSolver(const SolverParameter& param) - : SGDSolver(param) {} - explicit NesterovSolver(const string& param_file) - : SGDSolver(param_file) {} +template +class NesterovSolver: public SGDSolver { + public: + explicit NesterovSolver(const SolverParameter& param) + : SGDSolver(param) { + } + explicit NesterovSolver(const string& param_file) + : SGDSolver(param_file) { + } - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - DISABLE_COPY_AND_ASSIGN(NesterovSolver); + DISABLE_COPY_AND_ASSIGN (NesterovSolver); }; -template -class AdaGradSolver : public SGDSolver { - public: - explicit AdaGradSolver(const SolverParameter& param) - : SGDSolver(param) { constructor_sanity_check(); } - explicit AdaGradSolver(const string& param_file) - : SGDSolver(param_file) { constructor_sanity_check(); } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; - } - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - DISABLE_COPY_AND_ASSIGN(AdaGradSolver); +template +class AdaGradSolver: public SGDSolver { + public: + explicit AdaGradSolver(const SolverParameter& param) + : SGDSolver(param) { + constructor_sanity_check(); + } + 
explicit AdaGradSolver(const string& param_file) + : SGDSolver(param_file) { + constructor_sanity_check(); + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + void constructor_sanity_check() { + CHECK_EQ(0, this->param_.momentum()) + << "Momentum cannot be used with AdaGrad."; + } + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN (AdaGradSolver); }; -template +template Solver* GetSolver(const SolverParameter& param) { - SolverParameter_SolverType type = param.solver_type(); - - switch (type) { - case SolverParameter_SolverType_SGD: - return new SGDSolver(param); - case SolverParameter_SolverType_NESTEROV: - return new NesterovSolver(param); - case SolverParameter_SolverType_ADAGRAD: - return new AdaGradSolver(param); - default: - LOG(FATAL) << "Unknown SolverType: " << type; - } - return (Solver*) NULL; + SolverParameter_SolverType type = param.solver_type(); + + switch (type) { + case SolverParameter_SolverType_SGD: + return new SGDSolver(param); + case SolverParameter_SolverType_NESTEROV: + return new NesterovSolver(param); + case SolverParameter_SolverType_ADAGRAD: + return new AdaGradSolver(param); + default: + LOG(FATAL) << "Unknown SolverType: " << type; + } + return (Solver*) NULL; } } // namespace caffe diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 0fe6546d..0b053a48 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -49,15 +49,14 @@ namespace caffe { // does not seem to create a memory bottleneck here. 
inline void CaffeMallocHost(void** ptr, size_t size) { - *ptr = malloc(size); - CHECK(*ptr) << "host allocation of size " << size << " failed"; + *ptr = malloc(size); + CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr) { - free(ptr); + free(ptr); } - /** * @brief Manages memory allocation and synchronization between the host (CPU) * and device (GPU). @@ -65,47 +64,56 @@ inline void CaffeFreeHost(void* ptr) { * TODO(dox): more thorough description. */ class SyncedMemory { - public: - SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { - ocl_setup(); - } - explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { - ocl_setup(); - } + public: + SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), + own_cpu_data_(false), data_layer_(false) { + ocl_setup(); + } + explicit SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), + own_cpu_data_(false), data_layer_(false) { + ocl_setup(); + } - ~SyncedMemory(); - const void* cpu_data(); - void set_cpu_data(void* data); - const void* gpu_data(); - const void* gpu_cache_data(); - void* mutable_cpu_data(); - void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } - void set_data_layer(){ data_layer_ = true; } - private: - void ocl_setup(); - protected: - cl_kernel oclmem_kernel; + ~SyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + const void* gpu_cache_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + enum SyncedHead { + UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED + }; + SyncedHead head() { + return head_; + } + size_t size() { + return size_; + } + void 
set_data_layer() { + data_layer_ = true; + } + private: + void ocl_setup(); + protected: + cl_kernel oclmem_kernel; - private: - void to_cpu(); - void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - void* gpu_cache_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; - bool data_layer_; - DISABLE_COPY_AND_ASSIGN(SyncedMemory); -}; // class SyncedMemory + private: + void to_cpu(); + void to_gpu(); + void* cpu_ptr_; + void* gpu_ptr_; + void* gpu_cache_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; + bool data_layer_; + DISABLE_COPY_AND_ASSIGN (SyncedMemory); +}; +// class SyncedMemory -} // namespace caffe +}// namespace caffe #endif // CAFFE_SYNCEDMEM_HPP_ diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index b4f8f284..179e31ca 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -15,61 +15,62 @@ using std::cout; using std::endl; #ifdef CMAKE_BUILD - #include "caffe_config.h" +#include "caffe_config.h" #else - #define OPENCL_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" +#define OPENCL_TEST_DEVICE -1 +#define CMAKE_SOURCE_DIR "src/" +#define EXAMPLES_SOURCE_DIR "examples/" +#define CMAKE_EXT "" #endif int main(int argc, char** argv); namespace caffe { -template -class MultiDeviceTest : public ::testing::Test { - public: - typedef typename TypeParam::Dtype Dtype; - protected: - MultiDeviceTest() { - Caffe::set_mode(TypeParam::device); - } - virtual ~MultiDeviceTest() {} +template +class MultiDeviceTest: public ::testing::Test { + public: + typedef typename TypeParam::Dtype Dtype; + protected: + MultiDeviceTest() { + Caffe::set_mode(TypeParam::device); + } + virtual ~MultiDeviceTest() { + } }; typedef ::testing::Types TestDtypes; -template +template struct CPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::CPU; + typedef TypeParam Dtype; + static const 
Caffe::Brew device = Caffe::CPU; }; -template -class CPUDeviceTest : public MultiDeviceTest > { +template +class CPUDeviceTest: public MultiDeviceTest > { }; #ifdef CPU_ONLY typedef ::testing::Types, - CPUDevice > TestDtypesAndDevices; +CPUDevice > TestDtypesAndDevices; #else -template +template struct GPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::GPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::GPU; }; -template -class GPUDeviceTest : public MultiDeviceTest > { +template +class GPUDeviceTest: public MultiDeviceTest > { }; typedef ::testing::Types, CPUDevice, - GPUDevice, GPUDevice > - TestDtypesAndDevices; + GPUDevice, GPUDevice > +TestDtypesAndDevices; #endif diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index cc5dcbad..07fe69cf 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -15,244 +15,244 @@ namespace caffe { // The gradient checker adds a L2 normalization loss function on top of the // top blobs, and checks the gradient. -template +template class GradientChecker { - public: - // kink and kink_range specify an ignored nonsmooth region of the form - // kink - kink_range <= |feature value| <= kink + kink_range, - // which accounts for all nonsmoothness in use by caffe - GradientChecker(const Dtype stepsize, const Dtype threshold, - const unsigned int seed = 1701, const Dtype kink = 0., - const Dtype kink_range = -1) - : stepsize_(stepsize), threshold_(threshold), seed_(seed), - kink_(kink), kink_range_(kink_range) {} - // Checks the gradient of a layer, with provided bottom layers and top - // layers. - // Note that after the gradient check, we do not guarantee that the data - // stored in the layer parameters and the blobs are unchanged. 
- void CheckGradient(Layer* layer, const vector*>& bottom, - const vector*>& top, int check_bottom = -1) { - layer->SetUp(bottom, top); - CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); - } - void CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom = -1); + public: + // kink and kink_range specify an ignored nonsmooth region of the form + // kink - kink_range <= |feature value| <= kink + kink_range, + // which accounts for all nonsmoothness in use by caffe + GradientChecker(const Dtype stepsize, const Dtype threshold, + const unsigned int seed = 1701, const Dtype kink = 0., + const Dtype kink_range = -1) + : stepsize_(stepsize), threshold_(threshold), seed_(seed), + kink_(kink), kink_range_(kink_range) { + } + // Checks the gradient of a layer, with provided bottom layers and top + // layers. + // Note that after the gradient check, we do not guarantee that the data + // stored in the layer parameters and the blobs are unchanged. + void CheckGradient(Layer* layer, const vector*>& bottom, + const vector*>& top, int check_bottom = -1) { + layer->SetUp(bottom, top); + CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); + } + void CheckGradientExhaustive(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom = -1); - // CheckGradientEltwise can be used to test layers that perform element-wise - // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when - // i != j. - void CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top); + // CheckGradientEltwise can be used to test layers that perform element-wise + // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when + // i != j. 
+ void CheckGradientEltwise(Layer* layer, + const vector*>& bottom, const vector*>& top); - void CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise = false); + void CheckGradientSingle(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, bool element_wise = false); - // Checks the gradient of a network. This network should not have any data - // layers or loss layers, since the function does not explicitly deal with - // such cases yet. All input blobs and parameter blobs are going to be - // checked, layer-by-layer to avoid numerical problems to accumulate. - void CheckGradientNet(const Net& net, - const vector*>& input); + // Checks the gradient of a network. This network should not have any data + // layers or loss layers, since the function does not explicitly deal with + // such cases yet. All input blobs and parameter blobs are going to be + // checked, layer-by-layer to avoid numerical problems to accumulate. 
+ void CheckGradientNet(const Net& net, + const vector*>& input); - protected: - Dtype GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id = -1, int top_data_id = -1); - Dtype stepsize_; - Dtype threshold_; - unsigned int seed_; - Dtype kink_; - Dtype kink_range_; + protected: + Dtype GetObjAndGradient(const Layer& layer, + const vector*>& top, int top_id = -1, int top_data_id = -1); + Dtype stepsize_; + Dtype threshold_; + unsigned int seed_; + Dtype kink_; + Dtype kink_range_; }; - -template +template void GradientChecker::CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise) { - if (element_wise) { - CHECK_EQ(0, layer->blobs().size()); - CHECK_LE(0, top_id); - CHECK_LE(0, top_data_id); - const int top_count = top[top_id]->count(); - for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) { - CHECK_EQ(top_count, bottom[blob_id]->count()); - } - } - // First, figure out what blobs we need to check against, and zero init - // parameter blobs. - vector*> blobs_to_check; - vector propagate_down(bottom.size(), check_bottom < 0); - for (int i = 0; i < layer->blobs().size(); ++i) { - Blob* blob = layer->blobs()[i].get(); - caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); - blobs_to_check.push_back(blob); - } - if (check_bottom < 0) { - for (int i = 0; i < bottom.size(); ++i) { - blobs_to_check.push_back(bottom[i]); - } - } else { - CHECK_LT(check_bottom, bottom.size()); - blobs_to_check.push_back(bottom[check_bottom]); - propagate_down[check_bottom] = true; - } - // Compute the gradient analytically using Backward - Caffe::set_random_seed(seed_); - // Ignore the loss from the layer (it's just the weighted sum of the losses - // from the top blobs, whose gradients we may want to test individually). 
- layer->Forward(bottom, top); - // Get additional loss from the objective - GetObjAndGradient(*layer, top, top_id, top_data_id); - layer->Backward(top, propagate_down, bottom); - // Store computed gradients for all checked blobs - vector > > - computed_gradient_blobs(blobs_to_check.size()); - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - computed_gradient_blobs[blob_id].reset(new Blob()); - computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob); - const int count = blobs_to_check[blob_id]->count(); - const Dtype* diff = blobs_to_check[blob_id]->cpu_diff(); - Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->mutable_cpu_data(); - caffe_copy(count, diff, computed_gradients); - } - // Compute derivative of top w.r.t. each bottom and parameter input using - // finite differencing. - // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - const Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->cpu_data(); - // LOG(ERROR) << "Blob " << blob_id << ": checking " - // << current_blob->count() << " parameters."; - for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { - // For an element-wise layer, we only need to do finite differencing to - // compute the derivative of top[top_id][top_data_id] w.r.t. - // bottom[blob_id][i] only for i == top_data_id. For any other - // i != top_data_id, we know the derivative is 0 by definition, and simply - // check that that's true. - Dtype estimated_gradient = 0; - Dtype positive_objective = 0; - Dtype negative_objective = 0; - if (!element_wise || (feat_id == top_data_id)) { - // Do finite differencing. - // Compute loss with stepsize_ added to input. 
- current_blob->mutable_cpu_data()[feat_id] += stepsize_; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - positive_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Compute loss with stepsize_ subtracted from input. - current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - negative_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Recover original input value. - current_blob->mutable_cpu_data()[feat_id] += stepsize_; - estimated_gradient = (positive_objective - negative_objective) / - stepsize_ / 2.; - } - Dtype computed_gradient = computed_gradients[feat_id]; - Dtype feature = current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " " - // << current_blob->cpu_diff()[feat_id]; - if (kink_ - kink_range_ > fabs(feature) - || fabs(feature) > kink_ + kink_range_) { - // We check relative accuracy, but for too small values, we threshold - // the scale factor by 1. 
- Dtype scale = std::max( - std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); - EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) - << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blob_id << "," << feat_id - << "; feat = " << feature - << "; objective+ = " << positive_objective - << "; objective- = " << negative_objective; - } - // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "computed gradient: " << computed_gradient - // << " estimated_gradient: " << estimated_gradient; - } - } + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, bool element_wise) { + if (element_wise) { + CHECK_EQ(0, layer->blobs().size()); + CHECK_LE(0, top_id); + CHECK_LE(0, top_data_id); + const int top_count = top[top_id]->count(); + for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) { + CHECK_EQ(top_count, bottom[blob_id]->count()); + } + } + // First, figure out what blobs we need to check against, and zero init + // parameter blobs. + vector*> blobs_to_check; + vector propagate_down(bottom.size(), check_bottom < 0); + for (int i = 0; i < layer->blobs().size(); ++i) { + Blob* blob = layer->blobs()[i].get(); + caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); + blobs_to_check.push_back(blob); + } + if (check_bottom < 0) { + for (int i = 0; i < bottom.size(); ++i) { + blobs_to_check.push_back(bottom[i]); + } + } else { + CHECK_LT(check_bottom, bottom.size()); + blobs_to_check.push_back(bottom[check_bottom]); + propagate_down[check_bottom] = true; + } + // Compute the gradient analytically using Backward + Caffe::set_random_seed(seed_); + // Ignore the loss from the layer (it's just the weighted sum of the losses + // from the top blobs, whose gradients we may want to test individually). 
+ layer->Forward(bottom, top); + // Get additional loss from the objective + GetObjAndGradient(*layer, top, top_id, top_data_id); + layer->Backward(top, propagate_down, bottom); + // Store computed gradients for all checked blobs + vector < shared_ptr > > + computed_gradient_blobs(blobs_to_check.size()); + for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { + Blob* current_blob = blobs_to_check[blob_id]; + computed_gradient_blobs[blob_id].reset(new Blob()); + computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob); + const int count = blobs_to_check[blob_id]->count(); + const Dtype* diff = blobs_to_check[blob_id]->cpu_diff(); + Dtype* computed_gradients = + computed_gradient_blobs[blob_id]->mutable_cpu_data(); + caffe_copy(count, diff, computed_gradients); + } + // Compute derivative of top w.r.t. each bottom and parameter input using + // finite differencing. + // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; + for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { + Blob* current_blob = blobs_to_check[blob_id]; + const Dtype* computed_gradients = + computed_gradient_blobs[blob_id]->cpu_data(); + // LOG(ERROR) << "Blob " << blob_id << ": checking " + // << current_blob->count() << " parameters."; + for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { + // For an element-wise layer, we only need to do finite differencing to + // compute the derivative of top[top_id][top_data_id] w.r.t. + // bottom[blob_id][i] only for i == top_data_id. For any other + // i != top_data_id, we know the derivative is 0 by definition, and simply + // check that that's true. + Dtype estimated_gradient = 0; + Dtype positive_objective = 0; + Dtype negative_objective = 0; + if (!element_wise || (feat_id == top_data_id)) { + // Do finite differencing. + // Compute loss with stepsize_ added to input. 
+ current_blob->mutable_cpu_data()[feat_id] += stepsize_; + Caffe::set_random_seed(seed_); + layer->Forward(bottom, top); + positive_objective = + GetObjAndGradient(*layer, top, top_id, top_data_id); + // Compute loss with stepsize_ subtracted from input. + current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; + Caffe::set_random_seed(seed_); + layer->Forward(bottom, top); + negative_objective = + GetObjAndGradient(*layer, top, top_id, top_data_id); + // Recover original input value. + current_blob->mutable_cpu_data()[feat_id] += stepsize_; + estimated_gradient = (positive_objective - negative_objective) / + stepsize_ / 2.; + } + Dtype computed_gradient = computed_gradients[feat_id]; + Dtype feature = current_blob->cpu_data()[feat_id]; + // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " " + // << current_blob->cpu_diff()[feat_id]; + if (kink_ - kink_range_ > fabs(feature) + || fabs(feature) > kink_ + kink_range_) { + // We check relative accuracy, but for too small values, we threshold + // the scale factor by 1. 
+ Dtype scale = std::max( + std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); + EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) + << "debug: (top_id, top_data_id, blob_id, feat_id)=" + << top_id << "," << top_data_id << "," << blob_id << "," << feat_id + << "; feat = " << feature + << "; objective+ = " << positive_objective + << "; objective- = " << negative_objective; + } + // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; + // LOG(ERROR) << "computed gradient: " << computed_gradient + // << " estimated_gradient: " << estimated_gradient; + } + } } -template +template void GradientChecker::CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob."; - // LOG(ERROR) << "Exhaustive Mode."; - for (int i = 0; i < top.size(); ++i) { - // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count(); - for (int j = 0; j < top[i]->count(); ++j) { - // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j; - CheckGradientSingle(layer, bottom, top, check_bottom, i, j); - } - } + const vector*>& bottom, const vector*>& top, + int check_bottom) { + layer->SetUp(bottom, top); + CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob."; + // LOG(ERROR) << "Exhaustive Mode."; + for (int i = 0; i < top.size(); ++i) { + // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count(); + for (int j = 0; j < top[i]->count(); ++j) { + // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j; + CheckGradientSingle(layer, bottom, top, check_bottom, i, j); + } + } } -template +template void GradientChecker::CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; - const int check_bottom = -1; - const bool 
element_wise = true; - for (int i = 0; i < top.size(); ++i) { - for (int j = 0; j < top[i]->count(); ++j) { - CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise); - } - } + const vector*>& bottom, const vector*>& top) { + layer->SetUp(bottom, top); + CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; + const int check_bottom = -1; + const bool element_wise = true; + for (int i = 0; i < top.size(); ++i) { + for (int j = 0; j < top[i]->count(); ++j) { + CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise); + } + } } -template +template void GradientChecker::CheckGradientNet( - const Net& net, const vector*>& input) { - const vector > >& layers = net.layers(); - vector*> >& bottom_vecs = net.bottom_vecs(); - vector*> >& top_vecs = net.top_vecs(); - for (int i = 0; i < layers.size(); ++i) { - net.Forward(input); - LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); - CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]); - } + const Net& net, const vector*>& input) { + const vector > >& layers = net.layers(); + vector < vector*> > &bottom_vecs = net.bottom_vecs(); + vector < vector*> > &top_vecs = net.top_vecs(); + for (int i = 0; i < layers.size(); ++i) { + net.Forward(input); + LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); + CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]); + } } -template +template Dtype GradientChecker::GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id, int top_data_id) { - Dtype loss = 0; - if (top_id < 0) { - // the loss will be half of the sum of squares of all outputs - for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - const Dtype* top_blob_data = top_blob->cpu_data(); - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - int count = top_blob->count(); - for (int j = 0; j < count; ++j) { - loss += top_blob_data[j] * top_blob_data[j]; - } - // 
set the diff: simply the data. - caffe_copy(top_blob->count(), top_blob_data, top_blob_diff); - } - loss /= 2.; - } else { - // the loss will be the top_data_id-th element in the top_id-th blob. - for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - caffe_set(top_blob->count(), Dtype(0), top_blob_diff); - } - const Dtype loss_weight = 2; - loss = top[top_id]->cpu_data()[top_data_id] * loss_weight; - top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight; - } - return loss; + const vector*>& top, int top_id, int top_data_id) { + Dtype loss = 0; + if (top_id < 0) { + // the loss will be half of the sum of squares of all outputs + for (int i = 0; i < top.size(); ++i) { + Blob* top_blob = top[i]; + const Dtype* top_blob_data = top_blob->cpu_data(); + Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); + int count = top_blob->count(); + for (int j = 0; j < count; ++j) { + loss += top_blob_data[j] * top_blob_data[j]; + } + // set the diff: simply the data. + caffe_copy(top_blob->count(), top_blob_data, top_blob_diff); + } + loss /= 2.; + } else { + // the loss will be the top_data_id-th element in the top_id-th blob. 
+ for (int i = 0; i < top.size(); ++i) { + Blob* top_blob = top[i]; + Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); + caffe_set(top_blob->count(), Dtype(0), top_blob_diff); + } + const Dtype loss_weight = 2; + loss = top[top_id]->cpu_data()[top_data_id] * loss_weight; + top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight; + } + return loss; } } // namespace caffe diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index 890f31bf..f5818f6f 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -8,43 +8,50 @@ namespace caffe { class Timer { - public: - Timer(); - virtual ~Timer(); - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); - virtual float Seconds(); - - inline bool initted() { return initted_; } - inline bool running() { return running_; } - inline bool has_run_at_least_once() { return has_run_at_least_once_; } - - protected: - void Init(); - - bool initted_; - bool running_; - bool has_run_at_least_once_; -#ifndef CPU_ONLY - //cudaEvent_t start_gpu_; - //cudaEvent_t stop_gpu_; + public: + Timer(); + virtual ~Timer(); + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); + virtual float Seconds(); + + inline bool initted() { + return initted_; + } + inline bool running() { + return running_; + } + inline bool has_run_at_least_once() { + return has_run_at_least_once_; + } + + protected: + void Init(); + + bool initted_; + bool running_; + bool has_run_at_least_once_; + #ifndef CPU_ONLY + //cudaEvent_t start_gpu_; + //cudaEvent_t stop_gpu_; #endif - boost::posix_time::ptime start_cpu_; - boost::posix_time::ptime stop_cpu_; - float elapsed_milliseconds_; - float elapsed_microseconds_; + boost::posix_time::ptime start_cpu_; + boost::posix_time::ptime stop_cpu_; + float elapsed_milliseconds_; + float elapsed_microseconds_; }; -class CPUTimer : public Timer { - public: - 
explicit CPUTimer(); - virtual ~CPUTimer() {} - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); +class CPUTimer: public Timer { + public: + explicit CPUTimer(); + virtual ~CPUTimer() { + } + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); }; } // namespace caffe diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index b531dd5f..4acca743 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -15,116 +15,116 @@ } while (0) inline const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; - } - return "Unknown cudnn status"; + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case 
CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + } + return "Unknown cudnn status"; } namespace caffe { -namespace cudnn { - -template class dataType; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_FLOAT; - static float oneval, zeroval; - static const void *one, *zero; -}; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; - static double oneval, zeroval; - static const void *one, *zero; -}; - -template -inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { - const int stride_w = 1; - const int stride_h = w * stride_w; - const int stride_c = h * stride_h; - const int stride_n = c * stride_c; - setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); -} - -template -inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); -} - -template -inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); -} - -template -inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* 
conv, - cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, int pad_w, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); -} - -template -inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { - switch (poolmethod) { - case PoolingParameter_PoolMethod_MAX: - *mode = CUDNN_POOLING_MAX; - break; - case PoolingParameter_PoolMethod_AVE: - *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); -} - -} // namespace cudnn + namespace cudnn { + + template class dataType; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + static float oneval, zeroval; + static const void *one, *zero; + }; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + static double oneval, zeroval; + static const void *one, *zero; + }; + + template + inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w, + int stride_n, int stride_c, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, + n, c, h, w, stride_n, stride_c, stride_h, stride_w)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w) { + const int stride_w = 1; + const int stride_h = w * stride_w; + const int stride_c = h * stride_h; + const int stride_n = c * stride_c; + setTensor4dDesc(desc, n, c, 
h, w, + stride_n, stride_c, stride_h, stride_w); + } + + template + inline void createFilterDesc(cudnnFilterDescriptor_t* desc, + int n, int c, int h, int w) { + CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, + n, c, h, w)); + } + + template + inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); + } + + template + inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, + cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, + int pad_h, int pad_w, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); + } + + template + inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, + PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, + int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { + switch (poolmethod) { + case PoolingParameter_PoolMethod_MAX: + *mode = CUDNN_POOLING_MAX; + break; + case PoolingParameter_PoolMethod_AVE: + *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); + CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, + pad_h, pad_w, stride_h, stride_w)); + } + + } // namespace cudnn } // namespace caffe diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp index 59ec3d39..a65e3acf 100644 --- a/include/caffe/util/db.hpp +++ b/include/caffe/util/db.hpp @@ -6,43 +6,52 @@ #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" -namespace caffe { namespace db { +namespace caffe { +namespace db { -enum Mode { READ, WRITE, NEW }; +enum Mode { + READ, WRITE, NEW +}; class Cursor { - public: - Cursor() { } - virtual ~Cursor() { } - virtual void SeekToFirst() = 0; - virtual void Next() = 0; - virtual string key() = 0; - 
virtual string value() = 0; - virtual bool valid() = 0; - - DISABLE_COPY_AND_ASSIGN(Cursor); + public: + Cursor() { + } + virtual ~Cursor() { + } + virtual void SeekToFirst() = 0; + virtual void Next() = 0; + virtual string key() = 0; + virtual string value() = 0; + virtual bool valid() = 0; + + DISABLE_COPY_AND_ASSIGN (Cursor); }; class Transaction { - public: - Transaction() { } - virtual ~Transaction() { } - virtual void Put(const string& key, const string& value) = 0; - virtual void Commit() = 0; - - DISABLE_COPY_AND_ASSIGN(Transaction); + public: + Transaction() { + } + virtual ~Transaction() { + } + virtual void Put(const string& key, const string& value) = 0; + virtual void Commit() = 0; + + DISABLE_COPY_AND_ASSIGN (Transaction); }; class DB { - public: - DB() { } - virtual ~DB() { } - virtual void Open(const string& source, Mode mode) = 0; - virtual void Close() = 0; - virtual Cursor* NewCursor() = 0; - virtual Transaction* NewTransaction() = 0; - - DISABLE_COPY_AND_ASSIGN(DB); + public: + DB() { + } + virtual ~DB() { + } + virtual void Open(const string& source, Mode mode) = 0; + virtual void Close() = 0; + virtual Cursor* NewCursor() = 0; + virtual Transaction* NewTransaction() = 0; + + DISABLE_COPY_AND_ASSIGN (DB); }; DB* GetDB(DataParameter::DB backend); diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index 10623554..d3716de7 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -8,65 +8,86 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { -class LevelDBCursor : public Cursor { - public: - explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { SeekToFirst(); } - ~LevelDBCursor() { delete iter_; } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void Next() { iter_->Next(); } - virtual string key() { return iter_->key().ToString(); } - virtual string value() { return iter_->value().ToString(); } - virtual 
bool valid() { return iter_->Valid(); } +class LevelDBCursor: public Cursor { + public: + explicit LevelDBCursor(leveldb::Iterator* iter) + : iter_(iter) { + SeekToFirst(); + } + ~LevelDBCursor() { + delete iter_; + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + } + virtual void Next() { + iter_->Next(); + } + virtual string key() { + return iter_->key().ToString(); + } + virtual string value() { + return iter_->value().ToString(); + } + virtual bool valid() { + return iter_->Valid(); + } - private: - leveldb::Iterator* iter_; + private: + leveldb::Iterator* iter_; }; -class LevelDBTransaction : public Transaction { - public: - explicit LevelDBTransaction(leveldb::DB* db) : db_(db) { CHECK_NOTNULL(db_); } - virtual void Put(const string& key, const string& value) { - batch_.Put(key, value); - } - virtual void Commit() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); - CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); - } +class LevelDBTransaction: public Transaction { + public: + explicit LevelDBTransaction(leveldb::DB* db) + : db_(db) { + CHECK_NOTNULL(db_); + } + virtual void Put(const string& key, const string& value) { + batch_.Put(key, value); + } + virtual void Commit() { + leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); + CHECK(status.ok()) << "Failed to write batch to leveldb " + << std::endl << status.ToString(); + } - private: - leveldb::DB* db_; - leveldb::WriteBatch batch_; + private: + leveldb::DB* db_; + leveldb::WriteBatch batch_; - DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); + DISABLE_COPY_AND_ASSIGN (LevelDBTransaction); }; -class LevelDB : public DB { - public: - LevelDB() : db_(NULL) { } - virtual ~LevelDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (db_ != NULL) { - delete db_; - db_ = NULL; - } - } - virtual LevelDBCursor* NewCursor() { - return new 
LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); - } - virtual LevelDBTransaction* NewTransaction() { - return new LevelDBTransaction(db_); - } +class LevelDB: public DB { + public: + LevelDB() + : db_(NULL) { + } + virtual ~LevelDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (db_ != NULL) { + delete db_; + db_ = NULL; + } + } + virtual LevelDBCursor* NewCursor() { + return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); + } + virtual LevelDBTransaction* NewTransaction() { + return new LevelDBTransaction(db_); + } - private: - leveldb::DB* db_; + private: + leveldb::DB* db_; }; - } // namespace db } // namespace caffe diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index cc7c90af..06424c94 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -7,82 +7,97 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { inline void MDB_CHECK(int mdb_status) { - CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); + CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); } -class LMDBCursor : public Cursor { - public: - explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { - SeekToFirst(); - } - virtual ~LMDBCursor() { - mdb_cursor_close(mdb_cursor_); - mdb_txn_abort(mdb_txn_); - } - virtual void SeekToFirst() { Seek(MDB_FIRST); } - virtual void Next() { Seek(MDB_NEXT); } - virtual string key() { - return string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); - } - virtual string value() { - return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); - } - virtual bool valid() { return valid_; } +class LMDBCursor: public Cursor { + public: + explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) + : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { + SeekToFirst(); + } + virtual 
~LMDBCursor() { + mdb_cursor_close(mdb_cursor_); + mdb_txn_abort(mdb_txn_); + } + virtual void SeekToFirst() { + Seek (MDB_FIRST); + } + virtual void Next() { + Seek (MDB_NEXT); + } + virtual string key() { + return string(static_cast(mdb_key_.mv_data), + mdb_key_.mv_size); + } + virtual string value() { + return string(static_cast(mdb_value_.mv_data), + mdb_value_.mv_size); + } + virtual bool valid() { + return valid_; + } - private: - void Seek(MDB_cursor_op op) { - int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); - if (mdb_status == MDB_NOTFOUND) { - valid_ = false; - } else { - MDB_CHECK(mdb_status); - valid_ = true; - } - } + private: + void Seek(MDB_cursor_op op) { + int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); + if (mdb_status == MDB_NOTFOUND) { + valid_ = false; + } else { + MDB_CHECK(mdb_status); + valid_ = true; + } + } - MDB_txn* mdb_txn_; - MDB_cursor* mdb_cursor_; - MDB_val mdb_key_, mdb_value_; - bool valid_; + MDB_txn* mdb_txn_; + MDB_cursor* mdb_cursor_; + MDB_val mdb_key_, mdb_value_; + bool valid_; }; -class LMDBTransaction : public Transaction { - public: - explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { } - virtual void Put(const string& key, const string& value); - virtual void Commit() { MDB_CHECK(mdb_txn_commit(mdb_txn_)); } +class LMDBTransaction: public Transaction { + public: + explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) + : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { + } + virtual void Put(const string& key, const string& value); + virtual void Commit() { + MDB_CHECK(mdb_txn_commit(mdb_txn_)); + } - private: - MDB_dbi* mdb_dbi_; - MDB_txn* mdb_txn_; + private: + MDB_dbi* mdb_dbi_; + MDB_txn* mdb_txn_; - DISABLE_COPY_AND_ASSIGN(LMDBTransaction); + DISABLE_COPY_AND_ASSIGN (LMDBTransaction); }; -class LMDB : public DB { - public: - LMDB() : mdb_env_(NULL) { } - virtual ~LMDB() { Close(); } - virtual void Open(const 
string& source, Mode mode); - virtual void Close() { - if (mdb_env_ != NULL) { - mdb_dbi_close(mdb_env_, mdb_dbi_); - mdb_env_close(mdb_env_); - mdb_env_ = NULL; - } - } - virtual LMDBCursor* NewCursor(); - virtual LMDBTransaction* NewTransaction(); +class LMDB: public DB { + public: + LMDB() + : mdb_env_(NULL) { + } + virtual ~LMDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (mdb_env_ != NULL) { + mdb_dbi_close(mdb_env_, mdb_dbi_); + mdb_env_close(mdb_env_); + mdb_env_ = NULL; + } + } + virtual LMDBCursor* NewCursor(); + virtual LMDBTransaction* NewTransaction(); - private: - MDB_env* mdb_env_; - MDB_dbi mdb_dbi_; + private: + MDB_env* mdb_env_; + MDB_dbi mdb_dbi_; }; } // namespace db diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index ba9c4aca..fda13567 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -29,79 +29,84 @@ namespace caffe { -template +template void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_col); -template +template void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im); -template +template void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - 
const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset); -template +template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col, const int col_offset); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); -template +template void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_col); -template +template void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); - -template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); - -template -void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); - -template 
-void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum); - -template -void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im, const int img_offset); - -template -void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int optnum); - -template + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im); + +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); + +template +void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); + +template +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, int optnum); + +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im, const int img_offset); + +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, + const int height, const int width, const 
int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, int optnum); + +template void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, cl_kernel Kernel); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, cl_kernel Kernel); -template +template void im2col_gpu_ocl(cl_mem data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, cl_kernel Kernel); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, cl_kernel Kernel); } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp index 446abb81..4c0d0106 100644 --- a/include/caffe/util/insert_splits.hpp +++ b/include/caffe/util/insert_splits.hpp @@ -12,14 +12,14 @@ namespace caffe { void InsertSplits(const NetParameter& param, NetParameter* param_split); void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param); + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param); string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx); + const int blob_idx); string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx); + const int blob_idx, const int split_idx); } // namespace caffe diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 3a62c3c9..faef67e3 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -19,119 +19,118 @@ namespace caffe { using ::google::protobuf::Message; inline void MakeTempFilename(string* temp_filename) { - 
temp_filename->clear(); - *temp_filename = "/tmp/caffe_test.XXXXXX"; - char* temp_filename_cstr = new char[temp_filename->size() + 1]; - // NOLINT_NEXT_LINE(runtime/printf) - strcpy(temp_filename_cstr, temp_filename->c_str()); - int fd = mkstemp(temp_filename_cstr); - CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename; - close(fd); - *temp_filename = temp_filename_cstr; - delete[] temp_filename_cstr; + temp_filename->clear(); + *temp_filename = "/tmp/caffe_test.XXXXXX"; + char* temp_filename_cstr = new char[temp_filename->size() + 1]; + // NOLINT_NEXT_LINE(runtime/printf) + strcpy(temp_filename_cstr, temp_filename->c_str()); + int fd = mkstemp(temp_filename_cstr); + CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename; + close(fd); + *temp_filename = temp_filename_cstr; + delete[] temp_filename_cstr; } inline void MakeTempDir(string* temp_dirname) { - temp_dirname->clear(); - *temp_dirname = "/tmp/caffe_test.XXXXXX"; - char* temp_dirname_cstr = new char[temp_dirname->size() + 1]; - // NOLINT_NEXT_LINE(runtime/printf) - strcpy(temp_dirname_cstr, temp_dirname->c_str()); - char* mkdtemp_result = mkdtemp(temp_dirname_cstr); - CHECK(mkdtemp_result != NULL) - << "Failed to create a temporary directory at: " << *temp_dirname; - *temp_dirname = temp_dirname_cstr; - delete[] temp_dirname_cstr; + temp_dirname->clear(); + *temp_dirname = "/tmp/caffe_test.XXXXXX"; + char* temp_dirname_cstr = new char[temp_dirname->size() + 1]; + // NOLINT_NEXT_LINE(runtime/printf) + strcpy(temp_dirname_cstr, temp_dirname->c_str()); + char* mkdtemp_result = mkdtemp(temp_dirname_cstr); + CHECK(mkdtemp_result != NULL) + << "Failed to create a temporary directory at: " << *temp_dirname; + *temp_dirname = temp_dirname_cstr; + delete[] temp_dirname_cstr; } bool ReadProtoFromTextFile(const char* filename, Message* proto); inline bool ReadProtoFromTextFile(const string& filename, Message* proto) { - return ReadProtoFromTextFile(filename.c_str(), proto); 
+ return ReadProtoFromTextFile(filename.c_str(), proto); } inline void ReadProtoFromTextFileOrDie(const char* filename, Message* proto) { - CHECK(ReadProtoFromTextFile(filename, proto)); + CHECK(ReadProtoFromTextFile(filename, proto)); } inline void ReadProtoFromTextFileOrDie(const string& filename, Message* proto) { - ReadProtoFromTextFileOrDie(filename.c_str(), proto); + ReadProtoFromTextFileOrDie(filename.c_str(), proto); } void WriteProtoToTextFile(const Message& proto, const char* filename); inline void WriteProtoToTextFile(const Message& proto, const string& filename) { - WriteProtoToTextFile(proto, filename.c_str()); + WriteProtoToTextFile(proto, filename.c_str()); } bool ReadProtoFromBinaryFile(const char* filename, Message* proto); inline bool ReadProtoFromBinaryFile(const string& filename, Message* proto) { - return ReadProtoFromBinaryFile(filename.c_str(), proto); + return ReadProtoFromBinaryFile(filename.c_str(), proto); } inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) { - CHECK(ReadProtoFromBinaryFile(filename, proto)); + CHECK(ReadProtoFromBinaryFile(filename, proto)); } inline void ReadProtoFromBinaryFileOrDie(const string& filename, - Message* proto) { - ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); + Message* proto) { + ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); } - void WriteProtoToBinaryFile(const Message& proto, const char* filename); inline void WriteProtoToBinaryFile( - const Message& proto, const string& filename) { - WriteProtoToBinaryFile(proto, filename.c_str()); + const Message& proto, const string& filename) { + WriteProtoToBinaryFile(proto, filename.c_str()); } bool ReadFileToDatum(const string& filename, const int label, Datum* datum); inline bool ReadFileToDatum(const string& filename, Datum* datum) { - return ReadFileToDatum(filename, -1, datum); + return ReadFileToDatum(filename, -1, datum); } bool ReadImageToDatum(const string& filename, const int label, - const int height, const 
int width, const bool is_color, - const std::string & encoding, Datum* datum); + const int height, const int width, const bool is_color, + const std::string & encoding, Datum* datum); inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); + const int height, const int width, const bool is_color, Datum* datum) { + return ReadImageToDatum(filename, label, height, width, is_color, + "", datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, true, datum); + const int height, const int width, Datum* datum) { + return ReadImageToDatum(filename, label, height, width, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, is_color, datum); + const bool is_color, Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, is_color, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, datum); + Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const std::string & encoding, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); + const std::string & encoding, Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); } bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); + const int height, const int width, const bool is_color); cv::Mat 
ReadImageToCVMat(const string& filename, - const int height, const int width); + const int height, const int width); cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); + const bool is_color); cv::Mat ReadImageToCVMat(const string& filename); @@ -140,19 +139,19 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); -template +template void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob); -template +template void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob); -template +template void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); + const hid_t file_id, const string& dataset_name, const Blob& blob); } // namespace caffe diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index b32760aa..0a7fd67f 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -40,156 +40,157 @@ namespace caffe { // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. -template +template void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); // Decaf gpu gemm provides an interface that is almost the same as the cpu // gemm function - following the c convention and calling the fortran-order // gpu code under the hood. 
-template +template void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); - -template -cl_event caffe_gpu_gemm( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, + Dtype* C, const int offC); /*This is Yuan Gao's sgemm_ex*/ -template +template void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C, const int offset1, const int offset2, const int offset3); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C, const int offset1, const int offset2, const int offset3); - -template +template cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, + Dtype* C, const int offC); -template 
+template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); -template +template void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, - const Dtype * x, size_t offx, const Dtype beta, int incx, - Dtype* y, size_t offy, int incy); + const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, + const Dtype * x, size_t offx, const Dtype beta, int incx, + Dtype* y, size_t offy, int incy); -template +template void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); - + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); -template +template void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); + Dtype* Y); -template +template void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); + Dtype* Y); -template +template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); -template +template void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); -template +template void caffe_copy(const int N, const Dtype *X, Dtype *Y); -template +template void caffe_set(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); inline void caffe_memset(const size_t N, const int alpha, void* X) { - memset(X, alpha, N); // NOLINT(caffe/alt_fn) + memset(X, alpha, N); // NOLINT(caffe/alt_fn) } inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { #ifndef CPU_ONLY - ocl_memset((int*)X, 
(alpha<<24)|(alpha<<16)|(alpha<<8)|alpha, N); + ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); #else - NO_GPU; + NO_GPU; #endif } void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); -template +template void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); -template +template void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); -template +template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); -template -void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X); +template +void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, + Dtype *X); -template +template void caffe_scal(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_sqr(const int N, const Dtype* a, Dtype* y); -template +template void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); //CUDA version, need to be deleted -template +template void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template -void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y); +template +void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, + const Dtype* b, Dtype* y); -template +template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); //CUDA version, need to be deleted -template +template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, 
Dtype* y); - unsigned int caffe_rng_rand(); -template +template Dtype caffe_nextafter(const Dtype b); -template +template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); // caffe_gpu_rng_uniform with two arguments generates integers in the range @@ -201,54 +202,54 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r); // specification of curandGenerateUniform. With a = 0, b = 1, just calls // curandGenerateUniform; with other limits will shift and scale the outputs // appropriately after calling curandGenerateUniform. -template +template void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); -template +template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); -template +template void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, int* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); -template +template void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); -template +template void caffe_exp(const int n, const Dtype* a, Dtype* y); -template +template Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); -template +template void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); -template +template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); -template +template uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); + const Dtype* y); // Returns the sum of the absolute values of the elements of vector x -template +template Dtype caffe_cpu_asum(const int n, const Dtype* x); -template +template void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c 
template inline char caffe_sign(Dtype val) { - return (Dtype(0) < val) - (val < Dtype(0)); + return (Dtype(0) < val) - (val < Dtype(0)); } // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC @@ -272,7 +273,6 @@ inline char caffe_sign(Dtype val) { template <> \ void caffe_cpu_##name(const int n, const double* x, double* y) - #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ template \ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ @@ -301,53 +301,51 @@ void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); -template +template void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); -template +template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); -template +template void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); -template +template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -template +template void caffe_exp(const int n, const Dtype* a, Dtype* y); - -template +template void caffe_abs(const int n, const Dtype* a, Dtype* y); -template +template void caffe_log(const int n, const Dtype* a, Dtype* y); -template +template Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); + const Dtype* y, const 
int incy); } // namespace caffe - #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 3355b665..e0d4d489 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -81,16 +81,16 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, - const int incX, const float beta, float* Y, - const int incY) { - cblas_sscal(N, beta, Y, incY); - cblas_saxpy(N, alpha, X, incX, Y, incY); + const int incX, const float beta, float* Y, + const int incY) { + cblas_sscal(N, beta, Y, incY); + cblas_saxpy(N, alpha, X, incX, Y, incY); } inline void cblas_daxpby(const int N, const double alpha, const double* X, - const int incX, const double beta, double* Y, - const int incY) { - cblas_dscal(N, beta, Y, incY); - cblas_daxpy(N, alpha, X, incX, Y, incY); + const int incX, const double beta, double* Y, + const int incY) { + cblas_dscal(N, beta, Y, incY); + cblas_daxpy(N, alpha, X, incX, Y, incY); } #endif // USE_MKL diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 2e56101e..1bd7c8d4 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -29,10 +29,11 @@ namespace caffe { -template +template void ocl_memset(Dtype* buffer, const Dtype value, const int count); -void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count); +void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, + const int count); void eventCallback(cl_event event, cl_int event_status, void * user_data); } // namespace caffe diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index a15b68ff..c4149789 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -31,237 +31,312 @@ 
namespace caffe { typedef unsigned int uint32_t; -template inline std::string get_dtype_suffix() +template inline std::string get_dtype_suffix() { - dtype x; - const char type = typeid(x).name()[0]; - std::string suffix; - switch(type){ - case 'i': suffix = "_int"; break; - case 'd': suffix = "_double"; break; - case 'f': - default: suffix = "_float"; - } - return suffix; + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch (type) { + case 'i': + suffix = "_int"; + break; + case 'd': + suffix = "_double"; + break; + case 'f': + default: + suffix = "_float"; + } + return suffix; } -template -void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num); -template +template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum); + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum); -template -void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data); +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data); -template +template void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); -template -void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data); +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data); -template -Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss); +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, 
cl_mem d_loss); -template +template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); -template -void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const Dtype* label); - -template -void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data); - -template -void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask); - -template -void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); - -template -void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff); - -template - void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff); -template +template +void diff_gpu(cl_kernel 
Kernel, const int num, const int dim, Dtype* data, + const Dtype* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask); + +template +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff); +template void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); -template -void SigmoidBackward(const int count, const Dtype* 
top_diff, const Dtype* top_data, Dtype* bottom_diff); +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff); -template +template void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data); -template -void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff); - -template -void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data); - -template -void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data); - -template -void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data); - -template -void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data); - -template -void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data); - -template -void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int 
pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ); - -template -void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff); - - -template -void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor); - -template -void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor); - -template -void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data,const int offset_in, Dtype* bottom_diff); - -template -void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope); - -template -void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); - -template -void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y); - -template -void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data); - -template -void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff); - -template -void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); - -template -void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ); - -template -void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ); - -template +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, + Dtype* bottom_diff); + +template +void 
ThresholdForward(const int count, const Dtype threshold, + const Dtype* bottom_data, Dtype* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data); + +template +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data); + +template +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data); + +template +void StoPoolForwardTest(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, + const int clnum, const int channels_, const int intheight_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + 
const int stride_, const int pad_, Dtype* bottom_diff); + +template +void PReLUForward(const int count, const int channels, const int dim, + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor); + +template +void PReLUBackward(const int count, const int channels, const int dim, + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor); + +template +void PReLUParamBackward(const int count, const Dtype* top_diff, + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff); + +template +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, + Dtype negative_slope); + +template +void ReLUBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); + +template +void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, + const int* MaskMem, const Dtype scale_, Dtype *top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, + const float threshold_, const Dtype scale_, Dtype* bottom_diff); + +template +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, + Dtype threshold); + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y); + +template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out); + const int spatial_dim, const Dtype* data, Dtype* out); -template +template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data); + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data); -template -void 
kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out); +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, + Dtype* out); -template +template void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_log(const int count, const Dtype* data, Dtype* out); -template +template void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_add_scalar(const int count, const Dtype data, Dtype* out); -template +template void kernel_exp(const int count, const Dtype* data, Dtype* out); -template +template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum); + const int spatial_dim, const Dtype* data, Dtype* channel_sum); -template -void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data); +template +void kernel_channel_div(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_sum, Dtype* data); -template +template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot); + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot); -template +template void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts); + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool 
has_ignore_label_, const int ignore_label_, + Dtype* counts); -template +template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts); + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts); -template +template void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y); -template +template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); -template -void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale); +template +void LRNFillScale(cl_kernel LFSkernel, const int nthreads, + const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale); -template +template void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out); + Dtype* scale, Dtype negative_beta, Dtype* out); -template +template void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff); -template -void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y); - -template -void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y); - -template -void BNLLForward(const int count, const Dtype* 
bottom_data, Dtype *top_data); - -template -void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff); - -template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data); - -template + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff); +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y); + +template +void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, Dtype *out_data); + +template void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff); + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff); -template +template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask); + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask); -template +template void MaxBackward(const int nthreads, const 
Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff); + const int blob_idx, const int* mask, Dtype* bottom_diff); } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ - // namespace caffe +// namespace caffe diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp index 8f1cf0d1..b59d9a67 100644 --- a/include/caffe/util/rng.hpp +++ b/include/caffe/util/rng.hpp @@ -14,29 +14,30 @@ namespace caffe { typedef boost::mt19937 rng_t; inline rng_t* caffe_rng() { - return static_cast(Caffe::rng_stream().generator()); + return static_cast(Caffe::rng_stream().generator()); } // Fisher–Yates algorithm -template +template inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end, - RandomGenerator* gen) { - typedef typename std::iterator_traits::difference_type - difference_type; - typedef typename boost::uniform_int dist_type; - - difference_type length = std::distance(begin, end); - if (length <= 0) return; - - for (difference_type i = length - 1; i > 0; --i) { - dist_type dist(0, i); - std::iter_swap(begin + i, begin + dist(*gen)); - } + RandomGenerator* gen) { + typedef typename std::iterator_traits::difference_type + difference_type; + typedef typename boost::uniform_int dist_type; + + difference_type length = std::distance(begin, end); + if (length <= 0) + return; + + for (difference_type i = length - 1; i > 0; --i) { + dist_type dist(0, i); + std::iter_swap(begin + i, begin + dist(*gen)); + } } -template +template inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) { - shuffle(begin, end, caffe_rng()); + shuffle(begin, end, caffe_rng()); } } // namespace caffe diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp index c1f21a0d..d140e029 100644 --- a/include/caffe/util/upgrade_proto.hpp +++ b/include/caffe/util/upgrade_proto.hpp @@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param); // taking its top blob as input. 
// Error if any of these above layers are not-conv layers. void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad); + NetParameter* param_upgraded_pad); // Upgrade a single V0LayerConnection to the V1LayerParameter format. bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param); + V1LayerParameter* layer_param); V1LayerParameter_LayerType UpgradeV0LayerType(const string& type); @@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param); bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param); bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param); + LayerParameter* layer_param); const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type); @@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param); // Read parameters from a file into a NetParameter proto message. void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 3ee5a779..9b718bd8 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -20,135 +20,150 @@ namespace caffe { * @brief Abstract base class that factors out the BLAS code common to * ConvolutionLayer and DeconvolutionLayer. 
*/ -template -class BaseConvolutionLayer : public Layer { - public: - explicit BaseConvolutionLayer(const LayerParameter& param) - : Layer(param) {} - virtual ~BaseConvolutionLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline bool EqualNumBottomTopBlobs() const { return true; } - - protected: - // Helper functions that abstract away the column buffer and gemm arguments. - // The last argument in forward_cpu_gemm is so that we can skip the im2col if - // we just called weight_cpu_gemm with the same input. - void forward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_cpu_bias(Dtype* output, const Dtype* bias); - void backward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output); - void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* - weights); - void backward_cpu_bias(Dtype* bias, const Dtype* input); -//opencl related setup - void ocl_setup(); +template +class BaseConvolutionLayer: public Layer { + public: + explicit BaseConvolutionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~BaseConvolutionLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline bool EqualNumBottomTopBlobs() const { + return true; + } + + protected: + // Helper functions that abstract away the column buffer and gemm arguments. + // The last argument in forward_cpu_gemm is so that we can skip the im2col if + // we just called weight_cpu_gemm with the same input. 
+ void forward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_cpu_bias(Dtype* output, const Dtype* bias); + void backward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output); + void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* + weights); + void backward_cpu_bias(Dtype* bias, const Dtype* input); + //opencl related setup + void ocl_setup(); #ifndef CPU_ONLY - void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_bias(Dtype* output, const Dtype* bias); - void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); - void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void backward_gpu_bias(Dtype* bias, const Dtype* input); -#endif - - // reverse_dimensions should return true iff we are implementing deconv, so - // that conv helpers know which dimensions are which. - virtual bool reverse_dimensions() = 0; - // Compute height_out_ and width_out_ from other parameters. 
- virtual void compute_output_shape() = 0; - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int num_; - int channels_; - int pad_h_, pad_w_; - int height_, width_; - int group_; - int num_output_; - int height_out_, width_out_; - bool bias_term_; - bool is_1x1_; - - private: - // wrap im2col/col2im so we don't have to remember the (long) argument lists - inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { - im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { - col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_bias(Dtype* output, const Dtype* bias); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); + void backward_gpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* + weights); + void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype* + weights); + void backward_gpu_bias(Dtype* bias, const Dtype* input); + #endif + + // reverse_dimensions should return true iff we are implementing deconv, so + // that conv helpers know which dimensions are which. + virtual bool reverse_dimensions() = 0; + // Compute height_out_ and width_out_ from other parameters. 
+ virtual void compute_output_shape() = 0; + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int num_; + int channels_; + int pad_h_, pad_w_; + int height_, width_; + int group_; + int num_output_; + int height_out_, width_out_; + bool bias_term_; + bool is_1x1_; + + private: + // wrap im2col/col2im so we don't have to remember the (long) argument lists + inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { + im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + } + inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { + col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + } #ifndef CPU_ONLY - inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0); - } - inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_); - } - protected: - inline void conv_im2col_gpu_opt(const Dtype* data) { - im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2); - } - inline void conv_col2im_gpu_opt( Dtype* data) { - col2im_gpu_opt((Dtype*)transMem, 0, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); -} - private: - inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { - transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2); -} - inline void conv_transpose_gpu(const Dtype* data){ - opttrans(data, top_offset_, 1, M_ * group_, 
N_, (Dtype*)subTopMem, 0, opt_num2); -} -protected: - inline void gpu_memset(Dtype* data, Dtype value, int count) { - ocl_memset(data, value, count); -} + inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { + im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, + 0); + } + inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { + col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, + bottom_offset_); + } + protected: + inline void conv_im2col_gpu_opt(const Dtype* data) { + im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, + kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2); + } + inline void conv_col2im_gpu_opt(Dtype* data) { + col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, + kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); + } + private: + inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { + transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, + M_ * opt_num2, opt_num2); + } + inline void conv_transpose_gpu(const Dtype* data) { + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + } + protected: + inline void gpu_memset(Dtype* data, Dtype value, int count) { + ocl_memset(data, value, count); + } #endif -private: - int conv_out_channels_; - int conv_in_channels_; - int conv_out_spatial_dim_; - int conv_in_height_; - int conv_in_width_; - int kernel_dim_; + private: + int conv_out_channels_; + int conv_in_channels_; + int conv_out_spatial_dim_; + int conv_in_height_; + int conv_in_width_; + int kernel_dim_; - Blob col_buffer_; - Blob bias_multiplier_; + Blob col_buffer_; + Blob bias_multiplier_; //opencl related data structures -protected: - int opt_num2; - int M_, N_, 
K_; - int weight_offset_; - int col_offset_; - int output_offset_; - int top_offset_, top_offset_opt, bottom_offset_; -public: - static cl_mem subTopMem, transMem; - static size_t subtop_mem_size, trans_mem_size; + protected: + int opt_num2; + int M_, N_, K_; + int weight_offset_; + int col_offset_; + int output_offset_; + int top_offset_, top_offset_opt, bottom_offset_; + public: + static cl_mem subTopMem, transMem; + static size_t subtop_mem_size, trans_mem_size; }; /** @@ -167,62 +182,67 @@ class BaseConvolutionLayer : public Layer { * be filtered. col2im restores the output spatial structure by rolling up * the output channel N' columns of the output matrix. */ -template -class ConvolutionLayer : public BaseConvolutionLayer { - public: - /** - * @param param provides ConvolutionParameter convolution_param, - * with ConvolutionLayer options: - * - num_output. The number of filters. - * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by - * kernel_size for square filters or kernel_h and kernel_w for rectangular - * filters. - * - stride / stride_h / stride_w (\b optional, default 1). The filter - * stride, given by stride_size for equal dimensions or stride_h and stride_w - * for different strides. By default the convolution is dense with stride 1. - * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for - * convolution, given by pad for equal dimensions or pad_h and pad_w for - * different padding. Input padding is computed implicitly instead of - * actually padding. - * - group (\b optional, default 1). The number of filter groups. Group - * convolution is a method for reducing parameterization by selectively - * connecting input and output channels. The input and output channel dimensions must be divisible - * by the number of groups. For group @f$ \geq 1 @f$, the - * convolutional filters' input and output channels are separated s.t. 
each - * group takes 1 / group of the input channels and makes 1 / group of the - * output channels. Concretely 4 input channels, 8 output channels, and - * 2 groups separate input channels 1-2 and output channels 1-4 into the - * first group and input channels 3-4 and output channels 5-8 into the second - * group. - * - bias_term (\b optional, default true). Whether to have a bias. - * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library - * kernels + stream parallelism) engines. - */ - explicit ConvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Convolution"; } - -protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return false; } - virtual void compute_output_shape(); - - virtual void Forward_gpu_org(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu_opt2(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +template +class ConvolutionLayer: public BaseConvolutionLayer { + public: + /** + * @param param provides ConvolutionParameter convolution_param, + * with ConvolutionLayer options: + * - num_output. The number of filters. + * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by + * kernel_size for square filters or kernel_h and kernel_w for rectangular + * filters. + * - stride / stride_h / stride_w (\b optional, default 1). 
The filter + * stride, given by stride_size for equal dimensions or stride_h and stride_w + * for different strides. By default the convolution is dense with stride 1. + * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for + * convolution, given by pad for equal dimensions or pad_h and pad_w for + * different padding. Input padding is computed implicitly instead of + * actually padding. + * - group (\b optional, default 1). The number of filter groups. Group + * convolution is a method for reducing parameterization by selectively + * connecting input and output channels. The input and output channel dimensions must be divisible + * by the number of groups. For group @f$ \geq 1 @f$, the + * convolutional filters' input and output channels are separated s.t. each + * group takes 1 / group of the input channels and makes 1 / group of the + * output channels. Concretely 4 input channels, 8 output channels, and + * 2 groups separate input channels 1-2 and output channels 1-4 into the + * first group and input channels 3-4 and output channels 5-8 into the second + * group. + * - bias_term (\b optional, default true). Whether to have a bias. + * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library + * kernels + stream parallelism) engines. 
+ */ + explicit ConvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Convolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return false; + } + virtual void compute_output_shape(); + + virtual void Forward_gpu_org(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Forward_gpu_opt2(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_opt2(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -239,25 +259,30 @@ class ConvolutionLayer : public BaseConvolutionLayer { * padding is removed from the output rather than added to the input, and * stride results in upsampling rather than downsampling). 
*/ -template -class DeconvolutionLayer : public BaseConvolutionLayer { - public: - explicit DeconvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Deconvolution"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return true; } - virtual void compute_output_shape(); +template +class DeconvolutionLayer: public BaseConvolutionLayer { + public: + explicit DeconvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Deconvolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return true; + } + virtual void compute_output_shape(); }; #ifdef USE_CUDNN @@ -274,34 +299,34 @@ class DeconvolutionLayer : public BaseConvolutionLayer { * input and filter regimes the CUDNN engine is faster than the CAFFE engine, * but for fully-convolutional models and large inputs the CAFFE engine can be * faster as long as it fits in memory. 
-*/ + */ template class CuDNNConvolutionLayer : public ConvolutionLayer { - public: - explicit CuDNNConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNConvolutionLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t* handle_; - cudaStream_t* stream_; - vector bottom_descs_, top_descs_; - cudnnTensorDescriptor_t bias_desc_; - cudnnFilterDescriptor_t filter_desc_; - vector conv_descs_; - int bottom_offset_, top_offset_, weight_offset_, bias_offset_; - size_t workspaceSizeInBytes; - void *workspace; + public: + explicit CuDNNConvolutionLayer(const LayerParameter& param) + : ConvolutionLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNConvolutionLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t* handle_; + cudaStream_t* stream_; + vector bottom_descs_, top_descs_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; + vector conv_descs_; + int bottom_offset_, top_offset_, weight_offset_, bias_offset_; + size_t workspaceSizeInBytes; + void *workspace; }; #endif @@ -312,163 +337,183 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template -class Im2colLayer : public Layer { - public: - explicit Im2colLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Im2col"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int channels_; - int height_, width_; - int pad_h_, pad_w_; +template +class Im2colLayer: public Layer { + public: + explicit Im2colLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Im2col"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int channels_; + int height_, width_; + int pad_h_, pad_w_; }; // Forward declare PoolingLayer and SplitLayer for use in 
LRNLayer. -template class PoolingLayer; -template class SplitLayer; +template class PoolingLayer; +template class SplitLayer; /** * @brief Normalize the input in a local region across or within feature maps. * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template -class LRNLayer : public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "LRN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; - - // Fields 
used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; - - cl_kernel LFSkernel, LCDkernel, LCOkernel; +template +class LRNLayer: public Layer { + public: + explicit LRNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "LRN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; 
+ Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; + + cl_kernel LFSkernel, LCDkernel, LCOkernel; }; - /*n * @brief Pools the input image by taking the max, average, etc. within regions. * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template -class PoolingLayer : public Layer { - public: - explicit PoolingLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Pooling"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 
2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - Blob rand_idx_; - Blob max_idx_; +template +class PoolingLayer: public Layer { + public: + explicit PoolingLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Pooling"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int pad_h_, pad_w_; + int channels_; + int height_, width_; + int pooled_height_, pooled_width_; + bool global_pooling_; + Blob rand_idx_; + Blob max_idx_; }; @@ -476,32 +521,32 @@ class PoolingLayer : public Layer { /* * @brief cuDNN implementation of PoolingLayer. * Fallback to PoolingLayer for CPU mode. -*/ + */ template class CuDNNPoolingLayer : public PoolingLayer { - public: - explicit CuDNNPoolingLayer(const LayerParameter& param) - : PoolingLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNPoolingLayer(); - // Currently, cuDNN does not support the extra top blob. 
- virtual inline int MinTopBlobs() const { return -1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; + public: + explicit CuDNNPoolingLayer(const LayerParameter& param) + : PoolingLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNPoolingLayer(); + // Currently, cuDNN does not support the extra top blob. + virtual inline int MinTopBlobs() const {return -1;} + virtual inline int ExactNumTopBlobs() const {return 1;} + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_, top_desc_; + cudnnPoolingDescriptor_t pooling_desc_; + cudnnPoolingMode_t mode_; }; #endif @@ -511,64 +556,71 @@ class CuDNNPoolingLayer : public PoolingLayer { * so that the result vector of different sized * images are of the same size. 
*/ -template -class SPPLayer : public Layer { - public: - explicit SPPLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SPP"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - // calculates the kernel and stride dimensions for the pooling layer, - // returns a correctly configured LayerParameter for a PoolingLayer - virtual LayerParameter GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param); - - int pyramid_height_; - int bottom_h_, bottom_w_; - int channels_; - int kernel_h_, kernel_w_; - int pad_h_, pad_w_; - - /// the internal Split layer that feeds the pooling layers - shared_ptr > split_layer_; - /// top vector holder used in call to the underlying SplitLayer::Forward - vector*> split_top_vec_; - /// bottom vector holder used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_bottom_vecs_; - /// the internal Pooling layers of different kernel sizes - vector > > pooling_layers_; - /// top vector holders used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_top_vecs_; - /// pooling_outputs stores the outputs of the PoolingLayers - vector*> pooling_outputs_; - /// the internal Flatten layers that the Pooling layers feed into - vector*> flatten_layers_; 
- /// top vector holders used in call to the underlying FlattenLayer::Forward - vector*>*> flatten_top_vecs_; - /// flatten_outputs stores the outputs of the FlattenLayers - vector*> flatten_outputs_; - /// bottom vector holder used in call to the underlying ConcatLayer::Forward - vector*> concat_bottom_vec_; - /// the internal Concat layers that the Flatten layers feed into - shared_ptr > concat_layer_; +template +class SPPLayer: public Layer { + public: + explicit SPPLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SPP"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + // calculates the kernel and stride dimensions for the pooling layer, + // returns a correctly configured LayerParameter for a PoolingLayer + virtual LayerParameter GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param); + + int pyramid_height_; + int bottom_h_, bottom_w_; + int channels_; + int kernel_h_, kernel_w_; + int pad_h_, pad_w_; + + /// the internal Split layer that feeds the pooling layers + shared_ptr > split_layer_; + /// top vector holder used in call to the underlying SplitLayer::Forward + vector*> split_top_vec_; + /// bottom vector holder used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_bottom_vecs_; + /// the internal Pooling layers of different kernel sizes + vector > > pooling_layers_; + /// top vector holders used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_top_vecs_; + /// pooling_outputs stores the outputs of the PoolingLayers + vector*> pooling_outputs_; + /// the internal Flatten layers that the Pooling layers feed into + vector*> flatten_layers_; + /// top vector holders used in call to the underlying FlattenLayer::Forward + vector*>*> flatten_top_vecs_; + /// flatten_outputs stores the outputs of the FlattenLayers + vector*> flatten_outputs_; + /// bottom vector holder used in call to the underlying ConcatLayer::Forward + vector*> concat_bottom_vec_; + /// the internal Concat layers that the Flatten layers feed into + shared_ptr > concat_layer_; }; } // namespace caffe diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 4cec89ae..e7d129bb 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -8,494 +8,510 @@ namespace caffe { -template +template void Blob::Reshape(const int num, const int channels, const int 
height, - const int width) { - vector shape(4); - shape[0] = num; - shape[1] = channels; - shape[2] = height; - shape[3] = width; - Reshape(shape); + const int width) { + vector shape(4); + shape[0] = num; + shape[1] = channels; + shape[2] = height; + shape[3] = width; + Reshape(shape); } -template +template void Blob::Reshape(const vector& shape) { - CHECK_LE(shape.size(), kMaxBlobAxes); - count_ = 1; - shape_.resize(shape.size()); - for (int i = 0; i < shape.size(); ++i) { - CHECK_GE(shape[i], 0); - CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; - count_ *= shape[i]; - shape_[i] = shape[i]; - } - if (count_ > capacity_) { - capacity_ = count_; - data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); - diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); - } -} - -template + CHECK_LE(shape.size(), kMaxBlobAxes); + count_ = 1; + shape_.resize(shape.size()); + for (int i = 0; i < shape.size(); ++i) { + CHECK_GE(shape[i], 0); + CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; + count_ *= shape[i]; + shape_[i] = shape[i]; + } + if (count_ > capacity_) { + capacity_ = count_; + data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); + diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); + } +} + +template void Blob::Reshape(const BlobShape& shape) { - CHECK_LE(shape.dim_size(), kMaxBlobAxes); - vector shape_vec(shape.dim_size()); - for (int i = 0; i < shape.dim_size(); ++i) { - shape_vec[i] = shape.dim(i); - } - Reshape(shape_vec); + CHECK_LE(shape.dim_size(), kMaxBlobAxes); + vector shape_vec(shape.dim_size()); + for (int i = 0; i < shape.dim_size(); ++i) { + shape_vec[i] = shape.dim(i); + } + Reshape(shape_vec); } -template +template void Blob::ReshapeLike(const Blob& other) { - Reshape(other.shape()); + Reshape(other.shape()); } -template +template Blob::Blob(const int num, const int channels, const int height, - const int width) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { - 
Reshape(num, channels, height, width); + const int width) + // capacity_ must be initialized before calling Reshape + : capacity_(0) { + Reshape(num, channels, height, width); } -template +template Blob::Blob(const vector& shape) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { - Reshape(shape); + // capacity_ must be initialized before calling Reshape + : capacity_(0) { + Reshape(shape); } -template +template const Dtype* Blob::cpu_data() const { - CHECK(data_); - return (const Dtype*)data_->cpu_data(); + CHECK (data_); + return (const Dtype*) data_->cpu_data(); } -template +template void Blob::set_cpu_data(Dtype* data) { - CHECK(data); - data_->set_cpu_data(data); + CHECK(data); + data_->set_cpu_data(data); } -template +template const Dtype* Blob::gpu_data() const { - CHECK(data_); - return (const Dtype*)data_->gpu_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_data(); } -template +template const Dtype* Blob::gpu_cache_data() const { - CHECK(data_); - return (const Dtype*)data_->gpu_cache_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_cache_data(); } -template +template const Dtype* Blob::cpu_diff() const { - CHECK(diff_); - return (const Dtype*)diff_->cpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->cpu_data(); } -template +template const Dtype* Blob::gpu_diff() const { - CHECK(diff_); - return (const Dtype*)diff_->gpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->gpu_data(); } -template +template Dtype* Blob::mutable_cpu_data() { - CHECK(data_); - return static_cast(data_->mutable_cpu_data()); + CHECK (data_); + return static_cast(data_->mutable_cpu_data()); } -template +template Dtype* Blob::mutable_gpu_data() { - CHECK(data_); - return static_cast(data_->mutable_gpu_data()); + CHECK (data_); + return static_cast(data_->mutable_gpu_data()); } -template +template Dtype* Blob::mutable_cpu_diff() { - CHECK(diff_); - return static_cast(diff_->mutable_cpu_data()); + CHECK (diff_); + return 
static_cast(diff_->mutable_cpu_data()); } -template +template Dtype* Blob::mutable_gpu_diff() { - CHECK(diff_); - return static_cast(diff_->mutable_gpu_data()); + CHECK (diff_); + return static_cast(diff_->mutable_gpu_data()); } -template +template void Blob::ShareData(const Blob& other) { - CHECK_EQ(count_, other.count()); - data_ = other.data(); + CHECK_EQ(count_, other.count()); + data_ = other.data(); } -template +template void Blob::ShareDiff(const Blob& other) { - CHECK_EQ(count_, other.count()); - diff_ = other.diff(); + CHECK_EQ(count_, other.count()); + diff_ = other.diff(); } // The "update" method is used for parameter blobs in a Net, which are stored // as Blob or Blob -- hence we do not define it for // Blob or Blob. -template <> void Blob::Update() { NOT_IMPLEMENTED; } -template <> void Blob::Update() { NOT_IMPLEMENTED; } +template<> void Blob::Update() { + NOT_IMPLEMENTED; +} +template<> void Blob::Update() { + NOT_IMPLEMENTED; +} -template +template void Blob::Update() { - // We will perform update based on where the data is located. - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - // perform computation on CPU - caffe_axpy(count_, Dtype(-1), - static_cast(diff_->cpu_data()), - static_cast(data_->mutable_cpu_data())); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - // perform computation on GPU - caffe_gpu_axpy(count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); + // We will perform update based on where the data is located. 
+ switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + // perform computation on CPU + caffe_axpy < Dtype > (count_, Dtype(-1), + static_cast(diff_->cpu_data()), + static_cast(data_->mutable_cpu_data())); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + // perform computation on GPU + caffe_gpu_axpy < Dtype > (count_, Dtype(-1), + static_cast(diff_->gpu_data()), + static_cast(data_->mutable_gpu_data())); #else - NO_GPU; + NO_GPU; #endif - break; - default: - LOG(FATAL) << "Syncedmem not initialized."; - } + break; + default: + LOG(FATAL) << "Syncedmem not initialized."; + } } -template <> unsigned int Blob::asum_data() const { - NOT_IMPLEMENTED; - return 0; +template<> unsigned int Blob::asum_data() const { + NOT_IMPLEMENTED; + return 0; } -template <> int Blob::asum_data() const { - NOT_IMPLEMENTED; - return 0; +template<> int Blob::asum_data() const { + NOT_IMPLEMENTED; + return 0; } -template +template Dtype Blob::asum_data() const { - if (!data_) { return 0; } - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - return caffe_cpu_asum(count_, cpu_data()); - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_data(), &asum); - return asum; - } + if (!data_) { + return 0; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + return caffe_cpu_asum(count_, cpu_data()); + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + { + Dtype asum; + caffe_gpu_asum(count_, gpu_data(), &asum); + return asum; + } #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return 0; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } + return 0; } -template <> unsigned int Blob::asum_diff() const { - NOT_IMPLEMENTED; - 
return 0; +template<> unsigned int Blob::asum_diff() const { + NOT_IMPLEMENTED; + return 0; } -template <> int Blob::asum_diff() const { - NOT_IMPLEMENTED; - return 0; +template<> int Blob::asum_diff() const { + NOT_IMPLEMENTED; + return 0; } -template +template Dtype Blob::asum_diff() const { - if (!diff_) { return 0; } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - return caffe_cpu_asum(count_, cpu_diff()); - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_diff(), &asum); - return asum; - } + if (!diff_) { + return 0; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + return caffe_cpu_asum(count_, cpu_diff()); + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + { + Dtype asum; + caffe_gpu_asum(count_, gpu_diff(), &asum); + return asum; + } #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); - } - return 0; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); + } + return 0; } -template <> unsigned int Blob::sumsq_data() const { - NOT_IMPLEMENTED; - return 0; +template<> unsigned int Blob::sumsq_data() const { + NOT_IMPLEMENTED; + return 0; } -template <> int Blob::sumsq_data() const { - NOT_IMPLEMENTED; - return 0; +template<> int Blob::sumsq_data() const { + NOT_IMPLEMENTED; + return 0; } -template +template Dtype Blob::sumsq_data() const { - Dtype sumsq; - const Dtype* data; - if (!data_) { return 0; } - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - data = cpu_data(); - sumsq = caffe_cpu_dot(count_, data, data); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - data = gpu_data(); - caffe_gpu_dot(count_, data, data, &sumsq); + Dtype sumsq; + const Dtype* data; + if (!data_) { + return 
0; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + data = cpu_data(); + sumsq = caffe_cpu_dot(count_, data, data); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + data = gpu_data(); + caffe_gpu_dot(count_, data, data, &sumsq); #else - NO_GPU; + NO_GPU; #endif - break; - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return sumsq; + break; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } + return sumsq; } -template <> unsigned int Blob::sumsq_diff() const { - NOT_IMPLEMENTED; - return 0; +template<> unsigned int Blob::sumsq_diff() const { + NOT_IMPLEMENTED; + return 0; } -template <> int Blob::sumsq_diff() const { - NOT_IMPLEMENTED; - return 0; +template<> int Blob::sumsq_diff() const { + NOT_IMPLEMENTED; + return 0; } -template +template Dtype Blob::sumsq_diff() const { - Dtype sumsq; - const Dtype* diff; - if (!diff_) { return 0; } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - diff = cpu_diff(); - sumsq = caffe_cpu_dot(count_, diff, diff); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - diff = gpu_diff(); - caffe_gpu_dot(count_, diff, diff, &sumsq); - break; + Dtype sumsq; + const Dtype* diff; + if (!diff_) { + return 0; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + diff = cpu_diff(); + sumsq = caffe_cpu_dot(count_, diff, diff); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + diff = gpu_diff(); + caffe_gpu_dot(count_, diff, diff, &sumsq); + break; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return sumsq; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown 
SyncedMemory head state: " << data_->head(); + } + return sumsq; } -template <> void Blob::scale_data(unsigned int scale_factor) { - NOT_IMPLEMENTED; +template<> void Blob::scale_data(unsigned int scale_factor) { + NOT_IMPLEMENTED; } -template <> void Blob::scale_data(int scale_factor) { - NOT_IMPLEMENTED; +template<> void Blob::scale_data(int scale_factor) { + NOT_IMPLEMENTED; } -template +template void Blob::scale_data(Dtype scale_factor) { - Dtype* data; - if (!data_) { return; } - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - data = mutable_cpu_data(); - caffe_scal(count_, scale_factor, data); - return; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - data = mutable_gpu_data(); - caffe_gpu_scal(count_, scale_factor, data); - return; + Dtype* data; + if (!data_) { + return; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + data = mutable_cpu_data(); + caffe_scal(count_, scale_factor, data); + return; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + data = mutable_gpu_data(); + caffe_gpu_scal(count_, scale_factor, data); + return; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } + case SyncedMemory::UNINITIALIZED: + return; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } } -template <> void Blob::scale_diff(unsigned int scale_factor) { - NOT_IMPLEMENTED; +template<> void Blob::scale_diff(unsigned int scale_factor) { + NOT_IMPLEMENTED; } -template <> void Blob::scale_diff(int scale_factor) { - NOT_IMPLEMENTED; +template<> void Blob::scale_diff(int scale_factor) { + NOT_IMPLEMENTED; } -template +template void Blob::scale_diff(Dtype scale_factor) { - Dtype* diff; - if (!diff_) { return; } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - diff = mutable_cpu_diff(); - caffe_scal(count_, scale_factor, diff); 
- return; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - diff = mutable_gpu_diff(); - caffe_gpu_scal(count_, scale_factor, diff); - return; + Dtype* diff; + if (!diff_) { + return; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + diff = mutable_cpu_diff(); + caffe_scal(count_, scale_factor, diff); + return; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: + #ifndef CPU_ONLY + diff = mutable_gpu_diff(); + caffe_gpu_scal(count_, scale_factor, diff); + return; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); - } + case SyncedMemory::UNINITIALIZED: + return; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); + } } -template +template bool Blob::ShapeEquals(const BlobProto& other) { - if (other.has_num() || other.has_channels() || - other.has_height() || other.has_width()) { - // Using deprecated 4D Blob dimensions -- - // shape is (num, channels, height, width). - // Note: we do not use the normal Blob::num(), Blob::channels(), etc. - // methods as these index from the beginning of the blob shape, where legacy - // parameter blobs were indexed from the end of the blob shape (e.g., bias - // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). - return shape_.size() <= 4 && - LegacyShape(-4) == other.num() && - LegacyShape(-3) == other.channels() && - LegacyShape(-2) == other.height() && - LegacyShape(-1) == other.width(); - } - vector other_shape(other.shape().dim_size()); - for (int i = 0; i < other.shape().dim_size(); ++i) { - other_shape[i] = other.shape().dim(i); - } - return shape_ == other_shape; -} - -template + if (other.has_num() || other.has_channels() || + other.has_height() || other.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). 
+ // Note: we do not use the normal Blob::num(), Blob::channels(), etc. + // methods as these index from the beginning of the blob shape, where legacy + // parameter blobs were indexed from the end of the blob shape (e.g., bias + // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). + return shape_.size() <= 4 && + LegacyShape(-4) == other.num() && + LegacyShape(-3) == other.channels() && + LegacyShape(-2) == other.height() && + LegacyShape(-1) == other.width(); + } + vector other_shape(other.shape().dim_size()); + for (int i = 0; i < other.shape().dim_size(); ++i) { + other_shape[i] = other.shape().dim(i); + } + return shape_ == other_shape; +} + +template void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { - if (source.count() != count_ || source.shape() != shape_) { - if (reshape) { - ReshapeLike(source); - } else { - LOG(FATAL) << "Trying to copy blobs of different sizes."; - } - } - switch (Caffe::mode()) { - case Caffe::GPU: - if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), - static_cast(diff_->mutable_gpu_data())); - } else { - caffe_copy(count_, source.gpu_data(), - static_cast(data_->mutable_gpu_data())); - } - break; - case Caffe::CPU: - if (copy_diff) { - caffe_copy(count_, source.cpu_diff(), - static_cast(diff_->mutable_cpu_data())); - } else { - caffe_copy(count_, source.cpu_data(), - static_cast(data_->mutable_cpu_data())); - } - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } -} - -template + if (source.count() != count_ || source.shape() != shape_) { + if (reshape) { + ReshapeLike(source); + } else { + LOG(FATAL) << "Trying to copy blobs of different sizes."; + } + } + switch (Caffe::mode()) { + case Caffe::GPU: + if (copy_diff) { + caffe_copy(count_, source.gpu_diff(), + static_cast(diff_->mutable_gpu_data())); + } else { + caffe_copy(count_, source.gpu_data(), + static_cast(data_->mutable_gpu_data())); + } + break; + case Caffe::CPU: + if (copy_diff) { + caffe_copy(count_, 
source.cpu_diff(), + static_cast(diff_->mutable_cpu_data())); + } else { + caffe_copy(count_, source.cpu_data(), + static_cast(data_->mutable_cpu_data())); + } + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } +} + +template void Blob::FromProto(const BlobProto& proto, bool reshape) { - if (reshape) { - vector shape; - if (proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { - // Using deprecated 4D Blob dimensions -- - // shape is (num, channels, height, width). - shape.resize(4); - shape[0] = proto.num(); - shape[1] = proto.channels(); - shape[2] = proto.height(); - shape[3] = proto.width(); - } else { - shape.resize(proto.shape().dim_size()); - for (int i = 0; i < proto.shape().dim_size(); ++i) { - shape[i] = proto.shape().dim(i); - } - } - Reshape(shape); - } else { - CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)"; - } - // copy data - Dtype* data_vec = mutable_cpu_data(); - for (int i = 0; i < count_; ++i) { - data_vec[i] = proto.data(i); - } - if (proto.diff_size() > 0) { - Dtype* diff_vec = mutable_cpu_diff(); - for (int i = 0; i < count_; ++i) { - diff_vec[i] = proto.diff(i); - } - } -} - -template + if (reshape) { + vector shape; + if (proto.has_num() || proto.has_channels() || + proto.has_height() || proto.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). 
+ shape.resize(4); + shape[0] = proto.num(); + shape[1] = proto.channels(); + shape[2] = proto.height(); + shape[3] = proto.width(); + } else { + shape.resize(proto.shape().dim_size()); + for (int i = 0; i < proto.shape().dim_size(); ++i) { + shape[i] = proto.shape().dim(i); + } + } + Reshape(shape); + } else { + CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)"; + } + // copy data + Dtype* data_vec = mutable_cpu_data(); + for (int i = 0; i < count_; ++i) { + data_vec[i] = proto.data(i); + } + if (proto.diff_size() > 0) { + Dtype* diff_vec = mutable_cpu_diff(); + for (int i = 0; i < count_; ++i) { + diff_vec[i] = proto.diff(i); + } + } +} + +template void Blob::ToProto(BlobProto* proto, bool write_diff) const { - proto->clear_shape(); - for (int i = 0; i < shape_.size(); ++i) { - proto->mutable_shape()->add_dim(shape_[i]); - } - proto->clear_data(); - proto->clear_diff(); - const Dtype* data_vec = cpu_data(); - for (int i = 0; i < count_; ++i) { - proto->add_data(data_vec[i]); - } - if (write_diff) { - const Dtype* diff_vec = cpu_diff(); - for (int i = 0; i < count_; ++i) { - proto->add_diff(diff_vec[i]); - } - } -} - -INSTANTIATE_CLASS(Blob); -template class Blob; -template class Blob; + proto->clear_shape(); + for (int i = 0; i < shape_.size(); ++i) { + proto->mutable_shape()->add_dim(shape_[i]); + } + proto->clear_data(); + proto->clear_diff(); + const Dtype* data_vec = cpu_data(); + for (int i = 0; i < count_; ++i) { + proto->add_data(data_vec[i]); + } + if (write_diff) { + const Dtype* diff_vec = cpu_diff(); + for (int i = 0; i < count_; ++i) { + proto->add_diff(diff_vec[i]); + } + } +} + +INSTANTIATE_CLASS (Blob); +template class Blob ; +template class Blob ; } // namespace caffe diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index c1d26ab8..a6ea3a57 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -11,135 +11,142 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { - //To fix: for now we 
use fixed seed to get same result each time -/* - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - - LOG(INFO) << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - - pid = getpid(); - s = time(NULL); - seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); - //return seed; - LOG(WARNING) << "return fixed seed 37"; -*/ - return 37; + //To fix: for now we use fixed seed to get same result each time + /* + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + + LOG(INFO) << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + + pid = getpid(); + s = time(NULL); + seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); + //return seed; + LOG(WARNING) << "return fixed seed 37"; + */ + return 37; } - void GlobalInit(int* pargc, char*** pargv) { - // Google flags. - ::gflags::ParseCommandLineFlags(pargc, pargv, true); - // Google logging. - ::google::InitGoogleLogging(*(pargv)[0]); - // Provide a backtrace on segfault. - ::google::InstallFailureSignalHandler(); + // Google flags. + ::gflags::ParseCommandLineFlags(pargc, pargv, true); + // Google logging. + ::google::InitGoogleLogging(*(pargv)[0]); + // Provide a backtrace on segfault. + ::google::InstallFailureSignalHandler(); } #ifdef CPU_ONLY // CPU-only Caffe. 
Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU) { - } +: random_generator_(), mode_(Caffe::CPU) { +} -Caffe::~Caffe() { +Caffe::~Caffe() { } void Caffe::set_random_seed(const unsigned int seed) { - // RNG seed - Get().random_generator_.reset(new RNG(seed)); + // RNG seed + Get().random_generator_.reset(new RNG(seed)); } void Caffe::SetDevice(const int device_id) { - NO_GPU; + NO_GPU; } void Caffe::DeviceQuery() { - NO_GPU; + NO_GPU; } - class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: - shared_ptr rng_; + public: + Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} + explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} + caffe::rng_t* rng() {return rng_.get();} + private: + shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() : generator_(new Generator()) {} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_ = other.generator_; - return *this; + generator_ = other.generator_; + return *this; } void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); + return static_cast(generator_->rng()); } #else // Normal GPU + CPU Caffe. 
Caffe::Caffe() { - cl_int err = clblasSetup(); - if(err != CL_SUCCESS){ - LOG(ERROR) << "clBLAS setup failed "< rng_; + public: + Generator() + : rng_(new caffe::rng_t(cluster_seedgen())) { + } + explicit Generator(unsigned int seed) + : rng_(new caffe::rng_t(seed)) { + } + caffe::rng_t* rng() { + return rng_.get(); + } + private: + shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() + : generator_(new Generator()) { +} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) + : generator_(new Generator(seed)) { +} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_.reset(other.generator_.get()); - return *this; + generator_.reset(other.generator_.get()); + return *this; } void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); + return static_cast(generator_->rng()); } #endif // CPU_ONLY diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index f6d80dc2..892d758d 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -12,519 +12,518 @@ namespace caffe { template DataTransformer::DataTransformer(const TransformationParameter& param, - Phase phase) - : param_(param), phase_(phase) { - // check if we want to use mean_file - if (param_.has_mean_file()) { - CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; - const string& mean_file = param.mean_file(); - LOG(INFO) << "Loading mean file from: " << mean_file; - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - // check if we want to use mean_value - if (param_.mean_value_size() > 0) { - CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < param_.mean_value_size(); ++c) { - mean_values_.push_back(param_.mean_value(c)); - } - } + Phase phase) + : 
param_(param), phase_(phase) { + // check if we want to use mean_file + if (param_.has_mean_file()) { + CHECK_EQ(param_.mean_value_size(), 0) << + "Cannot specify mean_file and mean_value at the same time"; + const string& mean_file = param.mean_file(); + LOG(INFO) << "Loading mean file from: " << mean_file; + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + data_mean_.FromProto(blob_proto); + } + // check if we want to use mean_value + if (param_.mean_value_size() > 0) { + CHECK(param_.has_mean_file() == false) << + "Cannot specify mean_file and mean_value at the same time"; + for (int c = 0; c < param_.mean_value_size(); ++c) { + mean_values_.push_back(param_.mean_value(c)); + } + } } template void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { - const string& data = datum.data(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - const int crop_size = param_.crop_size(); - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_uint8 = data.size() > 0; - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(datum_channels, data_mean_.channels()); - CHECK_EQ(datum_height, data_mean_.height()); - CHECK_EQ(datum_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " << datum_channels; - if (datum_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < datum_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - 
} - - int height = datum_height; - int width = datum_width; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - height = crop_size; - width = crop_size; - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand(datum_height - crop_size + 1); - w_off = Rand(datum_width - crop_size + 1); - } else { - h_off = (datum_height - crop_size) / 2; - w_off = (datum_width - crop_size) / 2; - } - } - - Dtype datum_element; - int top_index, data_index; - for (int c = 0; c < datum_channels; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - if (has_uint8) { - datum_element = - static_cast(static_cast(data[data_index])); - } else { - datum_element = datum.float_data(data_index); - } - if (has_mean_file) { - transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = datum_element * scale; - } - } - } - } - } + Dtype* transformed_data) { + const string& data = datum.data(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + const int crop_size = param_.crop_size(); + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_uint8 = data.size() > 0; + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + + Dtype* mean = NULL; + if (has_mean_file) { + CHECK_EQ(datum_channels, data_mean_.channels()); + CHECK_EQ(datum_height, data_mean_.height()); + 
CHECK_EQ(datum_width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << + "Specify either 1 mean_value or as many as channels: " << datum_channels; + if (datum_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < datum_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int height = datum_height; + int width = datum_width; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + height = crop_size; + width = crop_size; + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand(datum_height - crop_size + 1); + w_off = Rand(datum_width - crop_size + 1); + } else { + h_off = (datum_height - crop_size) / 2; + w_off = (datum_width - crop_size) / 2; + } + } + + Dtype datum_element; + int top_index, data_index; + for (int c = 0; c < datum_channels; ++c) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; + if (do_mirror) { + top_index = (c * height + h) * width + (width - 1 - w); + } else { + top_index = (c * height + h) * width + w; + } + if (has_uint8) { + datum_element = + static_cast(static_cast(data[data_index])); + } else { + datum_element = datum.float_data(data_index); + } + if (has_mean_file) { + transformed_data[top_index] = + (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = + (datum_element - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = datum_element * scale; + } + } + } + } + } } template void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { - - // If datum is encoded, decoded and transform the cv::image. 
- if (datum.encoded()) { - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. - cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // Transform the cv::image into blob. - return Transform(cv_img, transformed_blob); - } else { - if (param_.force_color() || param_.force_gray()) { - LOG(ERROR) << "force_color and force_gray only for encoded datum"; - } - } - - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - // Check dimensions. - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, datum_channels); - CHECK_LE(height, datum_height); - CHECK_LE(width, datum_width); - CHECK_GE(num, 1); - - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - } else { - CHECK_EQ(datum_height, height); - CHECK_EQ(datum_width, width); - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - Transform(datum, transformed_data); + Blob* transformed_blob) { + + // If datum is encoded, decoded and transform the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Transform the cv::image into blob. 
+ return Transform(cv_img, transformed_blob); + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "force_color and force_gray only for encoded datum"; + } + } + + const int crop_size = param_.crop_size(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + // Check dimensions. + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, datum_channels); + CHECK_LE(height, datum_height); + CHECK_LE(width, datum_width); + CHECK_GE(num, 1); + + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + } else { + CHECK_EQ(datum_height, height); + CHECK_EQ(datum_width, width); + } + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + Transform(datum, transformed_data); } template void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { - const int datum_num = datum_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) << - "The size of datum_vector must be no greater than transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < datum_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(datum_vector[item_id], &uni_blob); - } + Blob* transformed_blob) { + const int datum_num = datum_vector.size(); + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = 
transformed_blob->width(); + + CHECK_GT(datum_num, 0) << "There is no datum to add"; + CHECK_LE(datum_num, num) << + "The size of datum_vector must be no greater than transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); + for (int item_id = 0; item_id < datum_num; ++item_id) { + int offset = transformed_blob->offset(item_id); + uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); + Transform(datum_vector[item_id], &uni_blob); + } } template void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { - const int mat_num = mat_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < mat_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(mat_vector[item_id], &uni_blob); - } + Blob* transformed_blob) { + const int mat_num = mat_vector.size(); + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + + CHECK_GT(mat_num, 0) << "There is no MAT to add"; + CHECK_EQ(mat_num, num) << + "The size of mat_vector must be equals to transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); + for (int item_id = 0; item_id < mat_num; ++item_id) { + int offset = transformed_blob->offset(item_id); + uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); + Transform(mat_vector[item_id], &uni_blob); + } } template void DataTransformer::Transform(const cv::Mat& cv_img, - 
Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - - // Check dimensions. - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, img_channels); - CHECK_LE(height, img_height); - CHECK_LE(width, img_width); - CHECK_GE(num, 1); - - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(img_channels, data_mean_.channels()); - CHECK_EQ(img_height, data_mean_.height()); - CHECK_EQ(img_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; - if (img_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < img_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int h_off = 0; - int w_off = 0; - cv::Mat cv_cropped_img = cv_img; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. 
- if (phase_ == TRAIN) { - h_off = Rand(img_height - crop_size + 1); - w_off = Rand(img_width - crop_size + 1); - } else { - h_off = (img_height - crop_size) / 2; - w_off = (img_width - crop_size) / 2; - } - cv::Rect roi(w_off, h_off, crop_size, crop_size); - cv_cropped_img = cv_img(roi); - } else { - CHECK_EQ(img_height, height); - CHECK_EQ(img_width, width); - } - - CHECK(cv_cropped_img.data); - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - int top_index; - for (int h = 0; h < height; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); - int img_index = 0; - for (int w = 0; w < width; ++w) { - for (int c = 0; c < img_channels; ++c) { - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (has_mean_file) { - int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; - transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = pixel * scale; - } - } - } - } - } + Blob* transformed_blob) { + const int crop_size = param_.crop_size(); + const int img_channels = cv_img.channels(); + const int img_height = cv_img.rows; + const int img_width = cv_img.cols; + + // Check dimensions. 
+ const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, img_channels); + CHECK_LE(height, img_height); + CHECK_LE(width, img_width); + CHECK_GE(num, 1); + + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(img_channels, 0); + CHECK_GE(img_height, crop_size); + CHECK_GE(img_width, crop_size); + + Dtype* mean = NULL; + if (has_mean_file) { + CHECK_EQ(img_channels, data_mean_.channels()); + CHECK_EQ(img_height, data_mean_.height()); + CHECK_EQ(img_width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << + "Specify either 1 mean_value or as many as channels: " << img_channels; + if (img_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < img_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int h_off = 0; + int w_off = 0; + cv::Mat cv_cropped_img = cv_img; + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + // We only do random crop when we do training. 
+ if (phase_ == TRAIN) { + h_off = Rand(img_height - crop_size + 1); + w_off = Rand(img_width - crop_size + 1); + } else { + h_off = (img_height - crop_size) / 2; + w_off = (img_width - crop_size) / 2; + } + cv::Rect roi(w_off, h_off, crop_size, crop_size); + cv_cropped_img = cv_img(roi); + } else { + CHECK_EQ(img_height, height); + CHECK_EQ(img_width, width); + } + + CHECK(cv_cropped_img.data); + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + int top_index; + for (int h = 0; h < height; ++h) { + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < width; ++w) { + for (int c = 0; c < img_channels; ++c) { + if (do_mirror) { + top_index = (c * height + h) * width + (width - 1 - w); + } else { + top_index = (c * height + h) * width + w; + } + // int top_index = (c * height + h) * width + w; + Dtype pixel = static_cast(ptr[img_index++]); + if (has_mean_file) { + int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; + transformed_data[top_index] = + (pixel - mean[mean_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = + (pixel - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = pixel * scale; + } + } + } + } + } } template void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int input_num = input_blob->num(); - const int input_channels = input_blob->channels(); - const int input_height = input_blob->height(); - const int input_width = input_blob->width(); - - if (transformed_blob->count() == 0) { - // Initialize transformed_blob with the right shape. 
- if (crop_size) { - transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); - } else { - transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); - } - } - - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int size = transformed_blob->count(); - - CHECK_LE(input_num, num); - CHECK_EQ(input_channels, channels); - CHECK_GE(input_height, height); - CHECK_GE(input_width, width); - - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand(input_height - crop_size + 1); - w_off = Rand(input_width - crop_size + 1); - } else { - h_off = (input_height - crop_size) / 2; - w_off = (input_width - crop_size) / 2; - } - } else { - CHECK_EQ(input_height, height); - CHECK_EQ(input_width, width); - } - - Dtype* input_data = input_blob->mutable_cpu_data(); - if (has_mean_file) { - CHECK_EQ(input_channels, data_mean_.channels()); - CHECK_EQ(input_height, data_mean_.height()); - CHECK_EQ(input_width, data_mean_.width()); - for (int n = 0; n < input_num; ++n) { - int offset = input_blob->offset(n); - caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); - } - } - - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " << input_channels; - if (mean_values_.size() == 1) { - caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); - } else { - for (int n = 0; n < input_num; ++n) { - for (int 
c = 0; c < input_channels; ++c) { - int offset = input_blob->offset(n, c); - caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); - } - } - } - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - - for (int n = 0; n < input_num; ++n) { - int top_index_n = n * channels; - int data_index_n = n * channels; - for (int c = 0; c < channels; ++c) { - int top_index_c = (top_index_n + c) * height; - int data_index_c = (data_index_n + c) * input_height + h_off; - for (int h = 0; h < height; ++h) { - int top_index_h = (top_index_c + h) * width; - int data_index_h = (data_index_c + h) * input_width + w_off; - if (do_mirror) { - int top_index_w = top_index_h + width - 1; - for (int w = 0; w < width; ++w) { - transformed_data[top_index_w-w] = input_data[data_index_h + w]; - } - } else { - for (int w = 0; w < width; ++w) { - transformed_data[top_index_h + w] = input_data[data_index_h + w]; - } - } - } - } - } - if (scale != Dtype(1)) { - DLOG(INFO) << "Scale: " << scale; - caffe_scal(size, scale, transformed_data); - } + Blob* transformed_blob) { + const int crop_size = param_.crop_size(); + const int input_num = input_blob->num(); + const int input_channels = input_blob->channels(); + const int input_height = input_blob->height(); + const int input_width = input_blob->width(); + + if (transformed_blob->count() == 0) { + // Initialize transformed_blob with the right shape. 
+ if (crop_size) { + transformed_blob->Reshape(input_num, input_channels, + crop_size, crop_size); + } else { + transformed_blob->Reshape(input_num, input_channels, + input_height, input_width); + } + } + + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int size = transformed_blob->count(); + + CHECK_LE(input_num, num); + CHECK_EQ(input_channels, channels); + CHECK_GE(input_height, height); + CHECK_GE(input_width, width); + + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand(input_height - crop_size + 1); + w_off = Rand(input_width - crop_size + 1); + } else { + h_off = (input_height - crop_size) / 2; + w_off = (input_width - crop_size) / 2; + } + } else { + CHECK_EQ(input_height, height); + CHECK_EQ(input_width, width); + } + + Dtype* input_data = input_blob->mutable_cpu_data(); + if (has_mean_file) { + CHECK_EQ(input_channels, data_mean_.channels()); + CHECK_EQ(input_height, data_mean_.height()); + CHECK_EQ(input_width, data_mean_.width()); + for (int n = 0; n < input_num; ++n) { + int offset = input_blob->offset(n); + caffe_sub(data_mean_.count(), input_data + offset, + data_mean_.cpu_data(), input_data + offset); + } + } + + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << + "Specify either 1 mean_value or as many as channels: " << input_channels; + if (mean_values_.size() == 1) { + caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); + } else { + for (int n = 0; n < input_num; ++n) { + for (int c 
= 0; c < input_channels; ++c) { + int offset = input_blob->offset(n, c); + caffe_add_scalar(input_height * input_width, -(mean_values_[c]), + input_data + offset); + } + } + } + } + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + + for (int n = 0; n < input_num; ++n) { + int top_index_n = n * channels; + int data_index_n = n * channels; + for (int c = 0; c < channels; ++c) { + int top_index_c = (top_index_n + c) * height; + int data_index_c = (data_index_n + c) * input_height + h_off; + for (int h = 0; h < height; ++h) { + int top_index_h = (top_index_c + h) * width; + int data_index_h = (data_index_c + h) * input_width + w_off; + if (do_mirror) { + int top_index_w = top_index_h + width - 1; + for (int w = 0; w < width; ++w) { + transformed_data[top_index_w - w] = input_data[data_index_h + w]; + } + } else { + for (int w = 0; w < width; ++w) { + transformed_data[top_index_h + w] = input_data[data_index_h + w]; + } + } + } + } + } + if (scale != Dtype(1)) { + DLOG(INFO) << "Scale: " << scale; + caffe_scal(size, scale, transformed_data); + } } template vector DataTransformer::InferBlobShape(const Datum& datum) { - if (datum.encoded()) { - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. - cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // InferBlobShape using the cv::image. - return InferBlobShape(cv_img); - } - - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - // Check dimensions. - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - // Build BlobShape. 
- vector shape(4); - shape[0] = 1; - shape[1] = datum_channels; - shape[2] = (crop_size)? crop_size: datum_height; - shape[3] = (crop_size)? crop_size: datum_width; - return shape; + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // InferBlobShape using the cv::image. + return InferBlobShape(cv_img); + } + + const int crop_size = param_.crop_size(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + // Check dimensions. + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = datum_channels; + shape[2] = (crop_size) ? crop_size : datum_height; + shape[3] = (crop_size) ? crop_size : datum_width; + return shape; } template vector DataTransformer::InferBlobShape( - const vector & datum_vector) { - const int num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to in the vector"; - // Use first datum in the vector to InferBlobShape. - vector shape = InferBlobShape(datum_vector[0]); - // Adjust num to the size of the vector. - shape[0] = num; - return shape; + const vector & datum_vector) { + const int num = datum_vector.size(); + CHECK_GT(num, 0) << "There is no datum to in the vector"; + // Use first datum in the vector to InferBlobShape. + vector shape = InferBlobShape(datum_vector[0]); + // Adjust num to the size of the vector. 
+ shape[0] = num; + return shape; } template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - // Check dimensions. - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - // Build BlobShape. - vector shape(4); - shape[0] = 1; - shape[1] = img_channels; - shape[2] = (crop_size)? crop_size: img_height; - shape[3] = (crop_size)? crop_size: img_width; - return shape; + const int crop_size = param_.crop_size(); + const int img_channels = cv_img.channels(); + const int img_height = cv_img.rows; + const int img_width = cv_img.cols; + // Check dimensions. + CHECK_GT(img_channels, 0); + CHECK_GE(img_height, crop_size); + CHECK_GE(img_width, crop_size); + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = img_channels; + shape[2] = (crop_size) ? crop_size : img_height; + shape[3] = (crop_size) ? crop_size : img_width; + return shape; } template vector DataTransformer::InferBlobShape( - const vector & mat_vector) { - const int num = mat_vector.size(); - CHECK_GT(num, 0) << "There is no cv_img to in the vector"; - // Use first cv_img in the vector to InferBlobShape. - vector shape = InferBlobShape(mat_vector[0]); - // Adjust num to the size of the vector. - shape[0] = num; - return shape; + const vector & mat_vector) { + const int num = mat_vector.size(); + CHECK_GT(num, 0) << "There is no cv_img to in the vector"; + // Use first cv_img in the vector to InferBlobShape. + vector shape = InferBlobShape(mat_vector[0]); + // Adjust num to the size of the vector. 
+ shape[0] = num; + return shape; } -template +template void DataTransformer::InitRand() { - const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); - if (needs_rand) { - const unsigned int rng_seed = caffe_rng_rand(); - rng_.reset(new Caffe::RNG(rng_seed)); - } else { - rng_.reset(); - } + const bool needs_rand = param_.mirror() || + (phase_ == TRAIN && param_.crop_size()); + if (needs_rand) { + const unsigned int rng_seed = caffe_rng_rand(); + rng_.reset(new Caffe::RNG(rng_seed)); + } else { + rng_.reset(); + } } -template +template int DataTransformer::Rand(int n) { - CHECK(rng_); - CHECK_GT(n, 0); - caffe::rng_t* rng = - static_cast(rng_->generator()); - return ((*rng)() % n); + CHECK (rng_); + CHECK_GT(n, 0); + caffe::rng_t* rng = + static_cast(rng_->generator()); + return ((*rng)() % n); } -INSTANTIATE_CLASS(DataTransformer); +INSTANTIATE_CLASS (DataTransformer); } // namespace caffe diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 7e745410..689f706e 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -37,378 +37,415 @@ string buildOption = "-x clc++ "; std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; -Device::~Device(){ - ReleaseKernels(); - free((void*)platformIDs); - free(DeviceIDs); - clReleaseProgram(Program); - clReleaseCommandQueue(CommandQueue); - clReleaseCommandQueue(CommandQueue_helper); - clReleaseContext(Context); - LOG(INFO) << "device destructor"; +Device::~Device() { + ReleaseKernels(); + free((void*) platformIDs); + free (DeviceIDs); + clReleaseProgram (Program); + clReleaseCommandQueue (CommandQueue); + clReleaseCommandQueue (CommandQueue_helper); + clReleaseContext (Context); + LOG(INFO) << "device destructor"; } - -cl_int Device::Init(int deviceId){ - - DisplayPlatformInfo(); - - clGetPlatformIDs(0, NULL, &numPlatforms); - cl_platform_id PlatformIDs[numPlatforms]; - clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); - - size_t nameLen; - cl_int res = 
clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); - if(res != CL_SUCCESS){ - fprintf(stderr, "Err: Failed to Get Platform Info\n"); - return 0; - } - platformName[nameLen] = 0; - - GetDeviceInfo(); - cl_uint uiNumDevices; - cl_bool unified_memory = false; - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - uiNumDevices = numDevices; - if(0 == uiNumDevices){ - LOG(FATAL) << "Err: No GPU devices"; - } else { - pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id)); - OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices)); - if (deviceId == -1) { - int i; - for (i = 0; i < (int)uiNumDevices; i++){ - clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL); - if(!unified_memory) { //skip iGPU - //we pick the first dGPU we found - pDevices[0] = pDevices[i]; - device_id = i; - LOG(INFO) << "Picked default device type : dGPU "<=0 && deviceId < uiNumDevices){ - pDevices[0] = pDevices[deviceId]; - device_id = deviceId; - LOG(INFO) << "Picked device type : GPU "<= 0 && deviceId < uiNumDevices) { + pDevices[0] = pDevices[deviceId]; + device_id = deviceId; + LOG(INFO) << "Picked device type : GPU " << device_id; + } else { + LOG(FATAL) << " Invalid GPU deviceId! 
"; + } + } + + Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); + if (NULL == Context) { + fprintf(stderr, "Err: Failed to Create Context\n"); + return 0; + } + CommandQueue = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + if (NULL == CommandQueue || NULL == CommandQueue_helper) { + fprintf(stderr, "Err: Failed to Create Commandqueue\n"); + return 0; + } + BuildProgram (oclKernelPath); + row = clblasRowMajor; + col = clblasColumnMajor; + return 0; } void Device::BuildProgram(std::string kernel_dir) -{ - std::string strSource = ""; - DIR *ocl_dir; - struct dirent *dirp; - if((ocl_dir=opendir(kernel_dir.c_str())) == NULL) - { - fprintf(stderr,"Err: Open ocl dir failed!\n"); - } - while((dirp = readdir(ocl_dir)) != NULL) - { - //Ignore hidden files - if(dirp->d_name[0] == '.') continue; - std::string file_name = std::string(dirp->d_name); - //Skip non *.cl files - size_t last_dot_pos = file_name.find_last_of("."); - if(file_name.substr(last_dot_pos+1) != "cl") continue; - - std::string ocl_kernel_full_path=kernel_dir+file_name; - std::string tmpSource = ""; - ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); - strSource += tmpSource; - } - const char *pSource; - pSource = strSource.c_str(); - size_t uiArrSourceSize[] = {0}; - uiArrSourceSize[0] = strlen(pSource); - Program = NULL; - Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL); - if(NULL == Program){ - fprintf(stderr,"Err: Failed to create program\n"); - } - cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), NULL, NULL); - LOG(INFO) << "Build Program"; - if(CL_SUCCESS != iStatus){ - fprintf(stderr,"Err: Failed to build program\n"); - char szBuildLog[16384]; - clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL); - std::cout << szBuildLog; - 
clReleaseProgram(Program); - } + { + std::string strSource = ""; + DIR *ocl_dir; + struct dirent *dirp; + if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) + { + fprintf(stderr, "Err: Open ocl dir failed!\n"); + } + while ((dirp = readdir(ocl_dir)) != NULL) + { + //Ignore hidden files + if (dirp->d_name[0] == '.') + continue; + std::string file_name = std::string(dirp->d_name); + //Skip non *.cl files + size_t last_dot_pos = file_name.find_last_of("."); + if (file_name.substr(last_dot_pos + 1) != "cl") + continue; + + std::string ocl_kernel_full_path = kernel_dir + file_name; + std::string tmpSource = ""; + ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); + strSource += tmpSource; + } + const char *pSource; + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = { 0 }; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, + NULL); + if (NULL == Program) { + fprintf(stderr, "Err: Failed to create program\n"); + } + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), + NULL, NULL); + LOG(INFO) << "Build Program"; + if (CL_SUCCESS != iStatus) { + fprintf(stderr, "Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, + sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram (Program); + } } //Use to read OpenCL source code -cl_int Device::ConvertToString(std::string pFileName,std::string &Str){ - size_t uiSize=0; - size_t uiFileSize=0; - char *pStr=NULL; - char *tmp = (char*)pFileName.data(); - std::fstream fFile(tmp,(std::fstream::in|std::fstream::binary)); - if(fFile.is_open()){ - fFile.seekg(0,std::fstream::end); - uiSize=uiFileSize=(size_t)fFile.tellg(); - fFile.seekg(0,std::fstream::beg); - pStr=new char[uiSize+1]; - - if(NULL==pStr){ - fFile.close(); - return 0; - } - fFile.read(pStr,uiFileSize); - fFile.close(); - pStr[uiSize]='\0'; - Str=pStr; 
- delete[] pStr; - return 0; - } - LOG(ERROR) << "Err: Failed to open cl file!"; - return -1; +cl_int Device::ConvertToString(std::string pFileName, std::string &Str) { + size_t uiSize = 0; + size_t uiFileSize = 0; + char *pStr = NULL; + char *tmp = (char*) pFileName.data(); + std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary)); + if (fFile.is_open()) { + fFile.seekg(0, std::fstream::end); + uiSize = uiFileSize = (size_t) fFile.tellg(); + fFile.seekg(0, std::fstream::beg); + pStr = new char[uiSize + 1]; + + if (NULL == pStr) { + fFile.close(); + return 0; + } + fFile.read(pStr, uiFileSize); + fFile.close(); + pStr[uiSize] = '\0'; + Str = pStr; + delete[] pStr; + return 0; + } + LOG(ERROR) << "Err: Failed to open cl file!"; + return -1; } cl_kernel Device::GetKernel(std::string kernel_name) -{ - std::map::iterator it = Kernels.find(kernel_name); - if (it == Kernels.end()) - { - cl_int _err=0; - cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err); - OCL_CHECK(_err); - Kernels[kernel_name] = kernel; - } - return Kernels[kernel_name]; + { + std::map::iterator it = Kernels.find(kernel_name); + if (it == Kernels.end()) + { + cl_int _err = 0; + cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); + OCL_CHECK(_err); + Kernels[kernel_name] = kernel; + } + return Kernels[kernel_name]; } void Device::ReleaseKernels() { - std::map::iterator it; - for (it = Kernels.begin(); it != Kernels.end(); it++) - { - clReleaseKernel(it->second); - } + std::map::iterator it; + for (it = Kernels.begin(); it != Kernels.end(); it++) + { + clReleaseKernel(it->second); + } } -void Device::DisplayPlatformInfo(){ - cl_int err; - - err = clGetPlatformIDs (0, NULL, &numPlatforms); - if (err != CL_SUCCESS || numPlatforms <=0) - { - LOG(ERROR) << "Failed to find any OpenCL platform."; - return; - } - - platformIDs = (cl_platform_id *) malloc (sizeof(cl_platform_id) * numPlatforms); - err = clGetPlatformIDs (numPlatforms, platformIDs, NULL); - if(err 
!= CL_SUCCESS) - { - LOG(ERROR) << "Failed to find any OpenCL platform."; - return; - } - - LOG(INFO) << "Number of platforms found:" << numPlatforms; - - //iterate through the list of platforms displaying platform information - for (cl_uint i = 0; i < numPlatforms; i++ ){ - DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); - DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); - DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS"); - } - +void Device::DisplayPlatformInfo() { + cl_int err; + + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (err != CL_SUCCESS || numPlatforms <= 0) + { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + platformIDs = (cl_platform_id *) malloc( + sizeof(cl_platform_id) * numPlatforms); + err = clGetPlatformIDs(numPlatforms, platformIDs, NULL); + if (err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + LOG(INFO) << "Number of platforms found:" << numPlatforms; + + //iterate through the list of platforms displaying platform information + for (cl_uint i = 0; i < numPlatforms; i++) { + DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); + DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); + DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, + "CL_PLATFORM_EXTENSIONS"); + } + } -void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str){ - cl_int err; - std::size_t paramValueSize; - - err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); - if(err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL platform:" << str; - return; - } - - char * info = 
(char *) alloca (sizeof(char) * paramValueSize); - err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); - if(err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL platform:" << str; - return; - } - - LOG(INFO) << "\t" << str << "\t" << info; +void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + char * info = (char *) alloca(sizeof(char) * paramValueSize); + err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + LOG(INFO) << "\t" << str << "\t" << info; } -void Device::GetDeviceInfo(){ - cl_int err; - //by default, we select the first platform. can be extended for more platforms - //query GPU device for now - err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - // we allow program run if no GPU is found. Just return. No error reported. 
- if (numDevices < 1) - { - LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; - LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; - return; - } - - DeviceIDs = (cl_device_id *) malloc (sizeof(cl_device_id) * numDevices); - err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, DeviceIDs, NULL); - if (err != CL_SUCCESS) - { - LOG(INFO) << "Failed to find any GPU devices."; - return; - } - - LOG(INFO) << "Number of devices found:" << numDevices; - for (cl_uint i = 0; i < numDevices; i++) { - LOG(INFO) << "\t" << "DeviceID" << ":\t" <(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); - } - - +void Device::GetDeviceInfo() { + cl_int err; + //by default, we select the first platform. 
can be extended for more platforms + //query GPU device for now + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, + &numDevices); + // we allow program run if no GPU is found. Just return. No error reported. + if (numDevices < 1) + { + LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; + LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; + return; + } + + DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, + DeviceIDs, NULL); + if (err != CL_SUCCESS) + { + LOG(INFO) << "Failed to find any GPU devices."; + return; + } + + LOG(INFO) << "Number of devices found:" << numDevices; + for (cl_uint i = 0; i < numDevices; i++) { + LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; + DisplayDeviceInfo < cl_device_type + > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo < size_t + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, + "Max work item sizes"); + DisplayDeviceInfo < cl_command_queue_properties + > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo < 
cl_device_exec_capabilities + > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + } + } void Device::DeviceQuery() { - DisplayPlatformInfo(); - - clGetPlatformIDs(0, NULL, &numPlatforms); - cl_platform_id PlatformIDs[numPlatforms]; - clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); - - size_t nameLen; - cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen); - if (res != CL_SUCCESS) { - fprintf(stderr, "Err: Failed to Get Platform Info\n"); - return; - } - platformName[nameLen] = 0; - - GetDeviceInfo(); + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); } -template -void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){ - cl_int err; - std::size_t paramValueSize; - - err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); - if(err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL device info:" << str; - return; - } - - std::string content; - T * info = (T *) alloca (sizeof(T) * paramValueSize); - err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); - if(err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL device info:" << str; - return; - } - - - switch(name){ - case CL_DEVICE_TYPE: - { - std::string deviceType; - appendBitfield( - 
*(reinterpret_cast(info)),CL_DEVICE_TYPE_CPU,"CL_DEVICE_TYPE_CPU",deviceType); - - appendBitfield( - *(reinterpret_cast(info)),CL_DEVICE_TYPE_GPU,"CL_DEVICE_TYPE_GPU",deviceType); - - appendBitfield( - *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_ACCELERATOR,"CL_DEVICE_TYPE_ACCELERATOR",deviceType); - - appendBitfield( - *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_DEFAULT,"CL_DEVICE_TYPE_DEFAULT",deviceType); - - LOG(INFO) << "\t " << str << ":\t" << deviceType; - } - break; - case CL_DEVICE_EXECUTION_CAPABILITIES: - { - std::string memType; - appendBitfield( - *(reinterpret_cast(info)),CL_EXEC_KERNEL,"CL_EXEC_KERNEL",memType); - - appendBitfield( - *(reinterpret_cast(info)),CL_EXEC_NATIVE_KERNEL,"CL_EXEC_NATIVE_KERNEL",memType); - - LOG(INFO) << "\t " << str << ":\t" << memType; - - } - break; - case CL_DEVICE_QUEUE_PROPERTIES: - { - std::string memType; - appendBitfield(*(reinterpret_cast(info)),CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,"CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE",memType); - - appendBitfield(*(reinterpret_cast(info)),CL_QUEUE_PROFILING_ENABLE,"CL_QUEUE_PROFILING_ENABLE",memType); - - LOG(INFO) << "\t " << str << ":\t" << memType; - } - break; - default: - LOG(INFO) << "\t" << str << ":\t" << *info; - break; -} +template +void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + std::string content; + T * info = (T *) alloca(sizeof(T) * paramValueSize); + err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) + { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + switch (name) { + case CL_DEVICE_TYPE: + { + std::string deviceType; + appendBitfield < cl_device_type + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, 
"CL_DEVICE_TYPE_CPU", deviceType); + + appendBitfield < cl_device_type + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); + + appendBitfield < cl_device_type + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); + + appendBitfield < cl_device_type + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); + + LOG(INFO) << "\t " << str << ":\t" << deviceType; + } + break; + case CL_DEVICE_EXECUTION_CAPABILITIES: + { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > ( + *(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); + + appendBitfield < cl_device_exec_capabilities + > ( + *(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + + } + break; + case CL_DEVICE_QUEUE_PROPERTIES: + { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + } + break; + default: + LOG(INFO) << "\t" << str << ":\t" << *info; + break; + } } template -void Device::appendBitfield(T info, T value , std::string name , std::string &str) -{ - if(info & value) - { - if (str.length() > 0) - { - str.append(" | "); - } - str.append(name); - } +void Device::appendBitfield(T info, T value, std::string name, std::string &str) + { + if (info & value) + { + if (str.length() > 0) + { + str.append(" | "); + } + str.append(name); + } } - } // namespace caffe diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index c2d19d43..64f4fa6b 100644 --- a/src/caffe/internal_thread.cpp +++ 
b/src/caffe/internal_thread.cpp @@ -4,37 +4,36 @@ namespace caffe { InternalThread::~InternalThread() { - WaitForInternalThreadToExit(); + WaitForInternalThreadToExit(); } bool InternalThread::is_started() const { - return thread_.get() != NULL && thread_->joinable(); + return thread_.get() != NULL && thread_->joinable(); } - bool InternalThread::StartInternalThread() { - if (!WaitForInternalThreadToExit()) { - return false; - } - try { - thread_.reset( - new boost::thread(&InternalThread::InternalThreadEntry, this)); - } catch (...) { - return false; - } - return true; + if (!WaitForInternalThreadToExit()) { + return false; + } + try { + thread_.reset( + new boost::thread(&InternalThread::InternalThreadEntry, this)); + } catch (...) { + return false; + } + return true; } /** Will not return until the internal thread has exited. */ bool InternalThread::WaitForInternalThreadToExit() { - if (is_started()) { - try { - thread_->join(); - } catch (...) { - return false; - } - } - return true; + if (is_started()) { + try { + thread_->join(); + } catch (...) { + return false; + } + } + return true; } } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 926c7d8f..4ff6e3d4 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -17,147 +17,147 @@ namespace caffe { // Get convolution layer according to engine. 
-template +template shared_ptr > GetConvolutionLayer( - const LayerParameter& param) { - ConvolutionParameter_Engine engine = param.convolution_param().engine(); - if (engine == ConvolutionParameter_Engine_DEFAULT) { - engine = ConvolutionParameter_Engine_CAFFE; + const LayerParameter& param) { + ConvolutionParameter_Engine engine = param.convolution_param().engine(); + if (engine == ConvolutionParameter_Engine_DEFAULT) { + engine = ConvolutionParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = ConvolutionParameter_Engine_CUDNN; + engine = ConvolutionParameter_Engine_CUDNN; #endif - } - if (engine == ConvolutionParameter_Engine_CAFFE) { - return shared_ptr >(new ConvolutionLayer(param)); + } + if (engine == ConvolutionParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new ConvolutionLayer(param)); #ifdef USE_CUDNN - } else if (engine == ConvolutionParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNConvolutionLayer(param)); + } else if (engine == ConvolutionParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNConvolutionLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); // Get pooling layer according to engine. 
-template +template shared_ptr > GetPoolingLayer(const LayerParameter& param) { - PoolingParameter_Engine engine = param.pooling_param().engine(); - if (engine == PoolingParameter_Engine_DEFAULT) { - engine = PoolingParameter_Engine_CAFFE; + PoolingParameter_Engine engine = param.pooling_param().engine(); + if (engine == PoolingParameter_Engine_DEFAULT) { + engine = PoolingParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = PoolingParameter_Engine_CUDNN; + engine = PoolingParameter_Engine_CUDNN; #endif - } - if (engine == PoolingParameter_Engine_CAFFE) { - return shared_ptr >(new PoolingLayer(param)); + } + if (engine == PoolingParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new PoolingLayer(param)); #ifdef USE_CUDNN - } else if (engine == PoolingParameter_Engine_CUDNN) { - PoolingParameter p_param = param.pooling_param(); - if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || - param.top_size() > 1) { - LOG(INFO) << "CUDNN does not support padding or multiple tops. " - << "Using Caffe's own pooling layer."; - return shared_ptr >(new PoolingLayer(param)); - } - return shared_ptr >(new CuDNNPoolingLayer(param)); + } else if (engine == PoolingParameter_Engine_CUDNN) { + PoolingParameter p_param = param.pooling_param(); + if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || + param.top_size() > 1) { + LOG(INFO) << "CUDNN does not support padding or multiple tops. " + << "Using Caffe's own pooling layer."; + return shared_ptr >(new PoolingLayer(param)); + } + return shared_ptr >(new CuDNNPoolingLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); // Get relu layer according to engine. 
-template +template shared_ptr > GetReLULayer(const LayerParameter& param) { - ReLUParameter_Engine engine = param.relu_param().engine(); - if (engine == ReLUParameter_Engine_DEFAULT) { - engine = ReLUParameter_Engine_CAFFE; + ReLUParameter_Engine engine = param.relu_param().engine(); + if (engine == ReLUParameter_Engine_DEFAULT) { + engine = ReLUParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = ReLUParameter_Engine_CUDNN; + engine = ReLUParameter_Engine_CUDNN; #endif - } - if (engine == ReLUParameter_Engine_CAFFE) { - return shared_ptr >(new ReLULayer(param)); + } + if (engine == ReLUParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new ReLULayer(param)); #ifdef USE_CUDNN - } else if (engine == ReLUParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNReLULayer(param)); + } else if (engine == ReLUParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNReLULayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(ReLU, GetReLULayer); // Get sigmoid layer according to engine. 
-template +template shared_ptr > GetSigmoidLayer(const LayerParameter& param) { - SigmoidParameter_Engine engine = param.sigmoid_param().engine(); - if (engine == SigmoidParameter_Engine_DEFAULT) { - engine = SigmoidParameter_Engine_CAFFE; + SigmoidParameter_Engine engine = param.sigmoid_param().engine(); + if (engine == SigmoidParameter_Engine_DEFAULT) { + engine = SigmoidParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = SigmoidParameter_Engine_CUDNN; + engine = SigmoidParameter_Engine_CUDNN; #endif - } - if (engine == SigmoidParameter_Engine_CAFFE) { - return shared_ptr >(new SigmoidLayer(param)); + } + if (engine == SigmoidParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new SigmoidLayer(param)); #ifdef USE_CUDNN - } else if (engine == SigmoidParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSigmoidLayer(param)); + } else if (engine == SigmoidParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNSigmoidLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); // Get softmax layer according to engine. 
-template +template shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { - SoftmaxParameter_Engine engine = param.softmax_param().engine(); - if (engine == SoftmaxParameter_Engine_DEFAULT) { - engine = SoftmaxParameter_Engine_CAFFE; + SoftmaxParameter_Engine engine = param.softmax_param().engine(); + if (engine == SoftmaxParameter_Engine_DEFAULT) { + engine = SoftmaxParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = SoftmaxParameter_Engine_CUDNN; + engine = SoftmaxParameter_Engine_CUDNN; #endif - } - if (engine == SoftmaxParameter_Engine_CAFFE) { - return shared_ptr >(new SoftmaxLayer(param)); + } + if (engine == SoftmaxParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new SoftmaxLayer(param)); #ifdef USE_CUDNN - } else if (engine == SoftmaxParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSoftmaxLayer(param)); + } else if (engine == SoftmaxParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNSoftmaxLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer); // Get tanh layer according to engine. 
-template +template shared_ptr > GetTanHLayer(const LayerParameter& param) { - TanHParameter_Engine engine = param.tanh_param().engine(); - if (engine == TanHParameter_Engine_DEFAULT) { - engine = TanHParameter_Engine_CAFFE; + TanHParameter_Engine engine = param.tanh_param().engine(); + if (engine == TanHParameter_Engine_DEFAULT) { + engine = TanHParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = TanHParameter_Engine_CUDNN; + engine = TanHParameter_Engine_CUDNN; #endif - } - if (engine == TanHParameter_Engine_CAFFE) { - return shared_ptr >(new TanHLayer(param)); + } + if (engine == TanHParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new TanHLayer(param)); #ifdef USE_CUDNN - } else if (engine == TanHParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNTanHLayer(param)); + } else if (engine == TanHParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNTanHLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); @@ -165,15 +165,15 @@ REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); #ifdef WITH_PYTHON_LAYER template shared_ptr > GetPythonLayer(const LayerParameter& param) { - Py_Initialize(); - try { - bp::object module = bp::import(param.python_param().module().c_str()); - bp::object layer = module.attr(param.python_param().layer().c_str())(param); - return bp::extract > >(layer)(); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } + Py_Initialize(); + try { + bp::object module = bp::import(param.python_param().module().c_str()); + bp::object layer = module.attr(param.python_param().layer().c_str())(param); + return bp::extract > >(layer)(); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } REGISTER_LAYER_CREATOR(Python, GetPythonLayer); @@ -181,4 +181,5 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer); // Layers that use their constructor as their 
default creator should be // registered in their corresponding cpp files. Do not register them here. -} // namespace caffe +} + // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 12776eb8..cd99296e 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -6,61 +6,61 @@ namespace caffe { -template +template void AbsValLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " + "allow in-place computation."; } -template +template void AbsValLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_cpu_data(); - caffe_abs(count, bottom[0]->cpu_data(), top_data); + const vector*>& bottom, const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_cpu_data(); + caffe_abs(count, bottom[0]->cpu_data(), top_data); } -template +template void AbsValLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_cpu_sign(count, bottom_data, bottom_diff); - caffe_mul(count, bottom_diff, top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->cpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_cpu_sign(count, bottom_data, 
bottom_diff); + caffe_mul(count, bottom_diff, top_diff, bottom_diff); + } } -template +template void AbsValLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); + const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); } -template +template void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sign(count, bottom_data, bottom_diff); + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU(AbsValLayer); #endif -INSTANTIATE_CLASS(AbsValLayer); -REGISTER_LAYER_CLASS(AbsVal); +INSTANTIATE_CLASS (AbsValLayer); +REGISTER_LAYER_CLASS (AbsVal); } // namespace caffe diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 90aad675..82f92e27 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -10,82 +10,82 @@ namespace caffe { -template +template void AccuracyLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - top_k_ = this->layer_param_.accuracy_param().top_k(); + const vector*>& bottom, const vector*>& top) { + top_k_ = 
this->layer_param_.accuracy_param().top_k(); - has_ignore_label_ = - this->layer_param_.accuracy_param().has_ignore_label(); - if (has_ignore_label_) { - ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); - } + has_ignore_label_ = + this->layer_param_.accuracy_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); + } } -template +template void AccuracyLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) - << "top_k must be less than or equal to the number of classes."; - label_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); - outer_num_ = bottom[0]->count(0, label_axis_); - inner_num_ = bottom[0]->count(label_axis_ + 1); - CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " - << "label count (number of labels) must be N*H*W, " - << "with integer values in {0, 1, ..., C-1}."; - vector top_shape(0); // Accuracy is a scalar; 0 axes. - top[0]->Reshape(top_shape); + const vector*>& bottom, const vector*>& top) { + CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) + << "top_k must be less than or equal to the number of classes."; + label_axis_ = + bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); + outer_num_ = bottom[0]->count(0, label_axis_); + inner_num_ = bottom[0]->count(label_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + vector top_shape(0); // Accuracy is a scalar; 0 axes. 
+ top[0]->Reshape(top_shape); } -template +template void AccuracyLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype accuracy = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const int dim = bottom[0]->count() / outer_num_; - const int num_labels = bottom[0]->shape(label_axis_); - vector maxval(top_k_+1); - vector max_id(top_k_+1); - int count = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; ++j) { - const int label_value = - static_cast(bottom_label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - continue; - } - DCHECK_GE(label_value, 0); - DCHECK_LT(label_value, num_labels); - // Top-k accuracy - std::vector > bottom_data_vector; - for (int k = 0; k < num_labels; ++k) { - bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); - } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); - // check if true label is in top k predictions - for (int k = 0; k < top_k_; k++) { - if (bottom_data_vector[k].second == label_value) { - ++accuracy; - break; - } - } - ++count; - } - } + const vector*>& top) { + Dtype accuracy = 0; + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const int dim = bottom[0]->count() / outer_num_; + const int num_labels = bottom[0]->shape(label_axis_); + vector < Dtype > maxval(top_k_ + 1); + vector max_id(top_k_ + 1); + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = + static_cast(bottom_label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels); + // Top-k accuracy + std::vector < std::pair > bottom_data_vector; + for (int k = 0; k < 
num_labels; ++k) { + bottom_data_vector.push_back(std::make_pair( + bottom_data[i * dim + k * inner_num_ + j], k)); + } + std::partial_sort( + bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, + bottom_data_vector.end(), std::greater >()); + // check if true label is in top k predictions + for (int k = 0; k < top_k_; k++) { + if (bottom_data_vector[k].second == label_value) { + ++accuracy; + break; + } + } + ++count; + } + } - // LOG(INFO) << "Accuracy: " << accuracy; - top[0]->mutable_cpu_data()[0] = accuracy / count; - // Accuracy layer should not be used as a loss function. + // LOG(INFO) << "Accuracy: " << accuracy; + top[0]->mutable_cpu_data()[0] = accuracy / count; + // Accuracy layer should not be used as a loss function. } -INSTANTIATE_CLASS(AccuracyLayer); -REGISTER_LAYER_CLASS(Accuracy); +INSTANTIATE_CLASS (AccuracyLayer); +REGISTER_LAYER_CLASS (Accuracy); } // namespace caffe diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index c4040cdc..87cc706e 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -8,56 +8,56 @@ namespace caffe { -template +template void ArgMaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - out_max_val_ = this->layer_param_.argmax_param().out_max_val(); - top_k_ = this->layer_param_.argmax_param().top_k(); - CHECK_GE(top_k_, 1) << " top k must not be less than 1."; - CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) - << "top_k must be less than or equal to the number of classes."; + const vector*>& top) { + out_max_val_ = this->layer_param_.argmax_param().out_max_val(); + top_k_ = this->layer_param_.argmax_param().top_k(); + CHECK_GE(top_k_, 1) << " top k must not be less than 1."; + CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) + << "top_k must be less than or equal to the number of classes."; } -template +template void ArgMaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - if (out_max_val_) { - 
// Produces max_ind and max_val - top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); - } else { - // Produces only max_ind - top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1); - } + const vector*>& top) { + if (out_max_val_) { + // Produces max_ind and max_val + top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); + } else { + // Produces only max_ind + top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1); + } } -template +template void ArgMaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - for (int i = 0; i < num; ++i) { - std::vector > bottom_data_vector; - for (int j = 0; j < dim; ++j) { - bottom_data_vector.push_back( - std::make_pair(bottom_data[i * dim + j], j)); - } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); - for (int j = 0; j < top_k_; ++j) { - top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; - } - if (out_max_val_) { - for (int j = 0; j < top_k_; ++j) { - top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first; - } - } - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + for (int i = 0; i < num; ++i) { + std::vector < std::pair > bottom_data_vector; + for (int j = 0; j < dim; ++j) { + bottom_data_vector.push_back( + std::make_pair(bottom_data[i * dim + j], j)); + } + std::partial_sort( + bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, + bottom_data_vector.end(), std::greater >()); + for (int j = 0; j < top_k_; ++j) { + top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; + } + if (out_max_val_) { + for (int j = 0; j < top_k_; ++j) { + top_data[top[0]->offset(i, 1, j)] = 
bottom_data_vector[j].first; + } + } + } } -INSTANTIATE_CLASS(ArgMaxLayer); -REGISTER_LAYER_CLASS(ArgMax); +INSTANTIATE_CLASS (ArgMaxLayer); +REGISTER_LAYER_CLASS (ArgMax); } // namespace caffe diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 394fd9a5..97c9afd3 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -11,447 +11,468 @@ namespace caffe { #ifdef use_packing_scheme template size_t BaseConvolutionLayer::subtop_mem_size = sizeof(Dtype); -template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); +template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); template cl_mem BaseConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); template cl_mem BaseConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); #endif -template +template void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) -{ - if(subtop_size > BaseConvolutionLayer::subtop_mem_size){ - ConvolutionLayer::subtop_mem_size = subtop_size; - clReleaseMemObject(ConvolutionLayer::subTopMem); - ConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); - } - if(trans_size > ConvolutionLayer::trans_mem_size){ - ConvolutionLayer::trans_mem_size = trans_size; - clReleaseMemObject(ConvolutionLayer::transMem); - ConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); - } + { + if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { + ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); + ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < 
Dtype > ::subtop_mem_size, NULL, + NULL); + } + if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { + ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); + ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, NULL, + NULL); + } } -template +template void BaseConvolutionLayer::ocl_setup() { - M_ = num_output_ / group_; - K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; - N_ = height_out_ * width_out_; + M_ = num_output_ / group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; + N_ = height_out_ * width_out_; #ifdef use_packing_scheme - size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); - size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); - Alloc_public_tmp_mem(subtop_size, trans_size); + size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); + size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); + Alloc_public_tmp_mem(subtop_size, trans_size); #endif } - -template - BaseConvolutionLayer::~BaseConvolutionLayer(){ +template +BaseConvolutionLayer::~BaseConvolutionLayer() { } - -template +template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - // Configure the kernel size, padding, stride, and inputs. 
- ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - if (conv_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = conv_param.kernel_size(); - } else { - kernel_h_ = conv_param.kernel_h(); - kernel_w_ = conv_param.kernel_w(); - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - if (!conv_param.has_pad_h()) { - pad_h_ = pad_w_ = conv_param.pad(); - } else { - pad_h_ = conv_param.pad_h(); - pad_w_ = conv_param.pad_w(); - } - if (!conv_param.has_stride_h()) { - stride_h_ = stride_w_ = conv_param.stride(); - } else { - stride_h_ = conv_param.stride_h(); - stride_w_ = conv_param.stride_w(); - } - // Special case: im2col is the identity for 1x1 convolution with stride 1 - // and no padding, so flag for skipping the buffer and transformation. - is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 - && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; - // Configure output channels and groups. 
- channels_ = bottom[0]->channels(); - num_output_ = this->layer_param_.convolution_param().num_output(); - CHECK_GT(num_output_, 0); - group_ = this->layer_param_.convolution_param().group(); - CHECK_EQ(channels_ % group_, 0); - CHECK_EQ(num_output_ % group_, 0) - << "Number of output should be multiples of group."; - if (reverse_dimensions()) { - conv_out_channels_ = channels_; - conv_in_channels_ = num_output_; - } else { - conv_out_channels_ = num_output_; - conv_in_channels_ = channels_; - } - - - // Handle the parameters: weights and biases. - // - blobs_[0] holds the filter weights - // - blobs_[1] holds the biases (optional) - bias_term_ = this->layer_param_.convolution_param().bias_term(); - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - if (bias_term_) { - this->blobs_.resize(2); - } else { - this->blobs_.resize(1); - } - // Initialize and fill the weights: - // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0].reset(new Blob( - conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); - shared_ptr > weight_filler(GetFiller( - this->layer_param_.convolution_param().weight_filler())); - weight_filler->Fill(this->blobs_[0].get()); - // If necessary, initialize and fill the biases. - if (bias_term_) { - vector bias_shape(1, num_output_); - this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.convolution_param().bias_filler())); - bias_filler->Fill(this->blobs_[1].get()); - } - } - // Propagate gradients to the parameters (as directed by backward pass). - this->param_propagate_down_.resize(this->blobs_.size(), true); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + // Configure the kernel size, padding, stride, and inputs. 
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + CHECK(!conv_param.has_kernel_size() != + !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK(conv_param.has_kernel_size() || + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK((!conv_param.has_pad() && conv_param.has_pad_h() + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK((!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); + } else { + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); + } else { + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); + } + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = conv_param.stride(); + } else { + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); + } + // Special case: im2col is the identity for 1x1 convolution with stride 1 + // and no padding, so flag for skipping the buffer and transformation. + is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 + && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; + // Configure output channels and groups. 
+ channels_ = bottom[0]->channels(); + num_output_ = this->layer_param_.convolution_param().num_output(); + CHECK_GT(num_output_, 0); + group_ = this->layer_param_.convolution_param().group(); + CHECK_EQ(channels_ % group_, 0); + CHECK_EQ(num_output_ % group_, 0) + << "Number of output should be multiples of group."; + if (reverse_dimensions()) { + conv_out_channels_ = channels_; + conv_in_channels_ = num_output_; + } else { + conv_out_channels_ = num_output_; + conv_in_channels_ = channels_; + } + + // Handle the parameters: weights and biases. + // - blobs_[0] holds the filter weights + // - blobs_[1] holds the biases (optional) + bias_term_ = this->layer_param_.convolution_param().bias_term(); + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Initialize and fill the weights: + // output channels x input channels per-group x kernel height x kernel width + this->blobs_[0].reset(new Blob( + conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); + shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( + this->layer_param_.convolution_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, initialize and fill the biases. + if (bias_term_) { + vector bias_shape(1, num_output_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( + this->layer_param_.convolution_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } + // Propagate gradients to the parameters (as directed by backward pass). 
+ this->param_propagate_down_.resize(this->blobs_.size(), true); } -template +template void BaseConvolutionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" - " convolution kernel."; - // TODO: generalize to handle inputs of different shapes. - for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { - CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; - CHECK_EQ(channels_, bottom[bottom_id]->channels()) - << "Inputs must have same channels."; - CHECK_EQ(height_, bottom[bottom_id]->height()) - << "Inputs must have same height."; - CHECK_EQ(width_, bottom[bottom_id]->width()) - << "Inputs must have same width."; - } - // Shape the tops. - compute_output_shape(); - for (int top_id = 0; top_id < top.size(); ++top_id) { - top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); - } - if (reverse_dimensions()) { - conv_in_height_ = height_out_; - conv_in_width_ = width_out_; - conv_out_spatial_dim_ = height_ * width_; - } else { - conv_in_height_ = height_; - conv_in_width_ = width_; - conv_out_spatial_dim_ = height_out_ * width_out_; - } - kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_; - weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_; - col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; - // The im2col result buffer will only hold one image at a time to avoid - // overly large memory usage. In the special case of 1x1 convolution - // it goes lazily unused to save memory. 
- if (reverse_dimensions()) { - col_buffer_.Reshape(1, kernel_dim_, height_, width_); - } else { - col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_); - } - // Set up the all ones "bias multiplier" for adding biases by BLAS - if (bias_term_) { - vector bias_multiplier_shape(1, height_out_ * width_out_); - bias_multiplier_.Reshape(bias_multiplier_shape); - caffe_set(bias_multiplier_.count(), Dtype(1), - bias_multiplier_.mutable_cpu_data()); - } - //initializa OpenCL kernels and cl_mem objects - ocl_setup(); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" + " convolution kernel."; + // TODO: generalize to handle inputs of different shapes. + for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { + CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; + CHECK_EQ(channels_, bottom[bottom_id]->channels()) + << "Inputs must have same channels."; + CHECK_EQ(height_, bottom[bottom_id]->height()) + << "Inputs must have same height."; + CHECK_EQ(width_, bottom[bottom_id]->width()) + << "Inputs must have same width."; + } + // Shape the tops. 
+ compute_output_shape(); + for (int top_id = 0; top_id < top.size(); ++top_id) { + top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); + } + if (reverse_dimensions()) { + conv_in_height_ = height_out_; + conv_in_width_ = width_out_; + conv_out_spatial_dim_ = height_ * width_; + } else { + conv_in_height_ = height_; + conv_in_width_ = width_; + conv_out_spatial_dim_ = height_out_ * width_out_; + } + kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_; + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_; + col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // The im2col result buffer will only hold one image at a time to avoid + // overly large memory usage. In the special case of 1x1 convolution + // it goes lazily unused to save memory. + if (reverse_dimensions()) { + col_buffer_.Reshape(1, kernel_dim_, height_, width_); + } else { + col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_); + } + // Set up the all ones "bias multiplier" for adding biases by BLAS + if (bias_term_) { + vector bias_multiplier_shape(1, height_out_ * width_out_); + bias_multiplier_.Reshape(bias_multiplier_shape); + caffe_set(bias_multiplier_.count(), Dtype(1), + bias_multiplier_.mutable_cpu_data()); + } + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); } -template +template void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); - } - col_buff = col_buffer_.cpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); - } + const 
Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); + } + col_buff = col_buffer_.cpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / + group_, conv_out_spatial_dim_, kernel_dim_ / group_, + (Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g, + (Dtype) 0., output + output_offset_ * g); + } } -template +template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, - const Dtype* bias) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), - (Dtype)1., output); + const Dtype* bias) { + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, + height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), + (Dtype) 1., output); } -template +template void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_cpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_cpu(col_buff, input); - } + const Dtype* weights, Dtype* input) { + Dtype* col_buff = col_buffer_.mutable_cpu_data(); + if (is_1x1_) { + col_buff = input; + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, + conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g, + (Dtype) 0., col_buff + col_offset_ * g); + } + if (!is_1x1_) { + conv_col2im_cpu(col_buff, input); + 
} } -template +template void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); - col_buff = col_buffer_.cpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); - } + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + if (!is_1x1_) { + conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); + col_buff = col_buffer_.cpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, + kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g, + (Dtype) 1., weights + weight_offset_ * g); + } } -template +template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, - const Dtype* input) { - caffe_cpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.cpu_data(), 1., bias); + const Dtype* input) { + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., + input, bias_multiplier_.cpu_data(), 1., bias); } #ifndef CPU_ONLY -template +template void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - } - col_buff = col_buffer_.gpu_data(); - } - - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, - conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights, weight_offset_ * g, col_buff, 
col_offset_ * g, - (Dtype)0., output, top_offset_+output_offset_ * g); - } + const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + } + col_buff = col_buffer_.gpu_data(); + } + + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, + conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + / group_, + (Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g, + (Dtype) 0., output, top_offset_ + output_offset_ * g); + } } -template -void BaseConvolutionLayer::forward_gpu_gemm_opt (const Dtype* input, - const Dtype* weight, Dtype* output, bool skip_im2col) { - cl_command_queue Queue; - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_gpu_opt(input); - } - col_buff = col_buffer_.gpu_data(); - }else{ - caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem); - } +template +void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, + const Dtype* weight, Dtype* output, bool skip_im2col) { + cl_command_queue Queue; + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_gpu_opt(input); + } + col_buff = col_buffer_.gpu_data(); + } else { + caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, + (Dtype*) transMem); + } #ifdef multiQ - for (int g = 0; g < group_; ++g) { - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; - caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); - } - if(group_ == 2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = amdDevice.CommandQueue; + 
else Queue = amdDevice.CommandQueue_helper; + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); + } + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #else - Queue = amdDevice.CommandQueue; - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); - } + Queue = amdDevice.CommandQueue; + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_ + * g, + (Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g); + } #endif - transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2); + transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, + opt_num2); } - -template +template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, - const Dtype* bias) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_*width_out_, 1, (Dtype)1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., output, top_offset_); + const Dtype* bias) { + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, + height_out_ * width_out_, 1, (Dtype) 1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype) 1., output, top_offset_); } -template +template void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, - const Dtype* bias) { - for (int z = 0; z < opt_num2; z++) - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype)1., output, top_offset_ + num_output_ * N_ * z); + 
const Dtype* bias) { + for (int z = 0; z < opt_num2; z++) + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype) 1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype) 1., output, top_offset_ + num_output_ * N_ * z); } -template +template void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_gpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights, weight_offset_ * g, - output, top_offset_ + output_offset_ * g, - (Dtype)0., col_buff, col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_gpu(col_buff, input); - } + const Dtype* weights, Dtype* input) { + Dtype* col_buff = col_buffer_.mutable_gpu_data(); + if (is_1x1_) { + col_buff = input; + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype) 1., weights, weight_offset_ * g, + output, top_offset_ + output_offset_ * g, + (Dtype) 0., col_buff, col_offset_ * g); + } + if (!is_1x1_) { + conv_col2im_gpu(col_buff, input); + } } -template +template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, - const Dtype* weights, Dtype* input) { - cl_command_queue Queue; - if (is_1x1_) { - caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem); - } - for (int g = 0; g < group_; ++g) { + const Dtype* weights, Dtype* input) { + cl_command_queue Queue; + if (is_1x1_) { + caffe_gpu_memcpy( + height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, + (Dtype*) transMem); + } + for (int g = 0; g < group_; ++g) { #ifdef multiQ - if(g == 0) Queue = 
amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; #else - Queue = amdDevice.CommandQueue; + Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, - (Dtype)1., weights, weight_offset_ * g, - (Dtype*)subTopMem, top_offset_opt * g, - (Dtype)0., (Dtype*)transMem, col_offset_ * g); - } + caffe_gpu_gemm < Dtype + > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, + (Dtype) 1., weights, weight_offset_ * g, + (Dtype*) subTopMem, top_offset_opt * g, + (Dtype) 0., (Dtype*) transMem, col_offset_ * g); + } #ifdef multiQ - if(group_ ==2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + if(group_ ==2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #endif - if (!is_1x1_) { - conv_col2im_gpu_opt(input); - }else{ - caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), (Dtype*)transMem, input); - } + if (!is_1x1_) { + conv_col2im_gpu_opt(input); + } else { + caffe_gpu_memcpy( + height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), + (Dtype*) transMem, input); + } } -template +template void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - col_buff = col_buffer_.gpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output, top_offset_, - (Dtype*)col_buff, col_offset_ * g, (Dtype)1., - (Dtype*)weights, weight_offset_ * g); - } + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + if (!is_1x1_) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + 
col_buff = col_buffer_.gpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype) 1., output, top_offset_, + (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., + (Dtype*) weights, weight_offset_ * g); + } } -template +template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, - const Dtype* output, Dtype* weights) { - cl_command_queue Queue; - if (!is_1x1_) { - conv_im2col_gpu_opt(input); - }else{ - caffe_gpu_memcpy( K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem); - } - opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2); - - for (int g = 0; g < group_; ++g) { + const Dtype* output, Dtype* weights) { + cl_command_queue Queue; + if (!is_1x1_) { + conv_im2col_gpu_opt(input); + } else { + caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, + (Dtype*) transMem); + } + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + + for (int g = 0; g < group_; ++g) { #ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; #else - Queue = amdDevice.CommandQueue; + Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, - (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g, - (Dtype*)transMem, col_offset_ * g, (Dtype)1., - (Dtype*)weights, weight_offset_ * g); + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + (Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g, + (Dtype*) transMem, col_offset_ * g, (Dtype) 1., + (Dtype*) weights, weight_offset_ * g); #ifdef multiQ - if(group_ == 2){ - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + 
if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #endif - } + } } -template +template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, - const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - (Dtype)1., input, top_offset_, N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1, - bias, (size_t)0, 1); + const Dtype* input) { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num_output_, N_, + (Dtype) 1., input, top_offset_, N_, + reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, + bias, (size_t) 0, 1); } #endif // !CPU_ONLY -INSTANTIATE_CLASS(BaseConvolutionLayer); +INSTANTIATE_CLASS (BaseConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 5ba0f2e5..f9a80979 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -7,109 +7,118 @@ namespace caffe { -template +template BaseDataLayer::BaseDataLayer(const LayerParameter& param) - : Layer(param), - transform_param_(param.transform_param()) { + : Layer(param), + transform_param_(param.transform_param()) { } -template +template void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - if (top.size() == 1) { - output_labels_ = false; - } else { - output_labels_ = true; - } - data_transformer_.reset( - new DataTransformer(transform_param_, this->phase_)); - data_transformer_->InitRand(); - // The subclasses should setup the size of bottom and top - DataLayerSetUp(bottom, top); + const vector*>& top) { + if (top.size() == 1) { + output_labels_ = false; + } else { + output_labels_ = true; + } + data_transformer_.reset( + new DataTransformer(transform_param_, this->phase_)); + data_transformer_->InitRand(); + // The subclasses should setup the size of bottom and top + DataLayerSetUp(bottom, top); } -template +template void BasePrefetchingDataLayer::LayerSetUp( - const 
vector*>& bottom, const vector*>& top) { - BaseDataLayer::LayerSetUp(bottom, top); - // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make - // simultaneous cudaMalloc calls when the main thread is running. In some - // GPUs this seems to cause failures if we do not so. - this->prefetch_data_.mutable_cpu_data(); - if (this->output_labels_) { - this->prefetch_label_.mutable_cpu_data(); - } - DLOG(INFO) << "Initializing prefetch"; - this->CreatePrefetchThread(); - DLOG(INFO) << "Prefetch initialized."; + const vector*>& bottom, const vector*>& top) { + BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); + // Now, start the prefetch thread. Before calling prefetch, we make two + // cpu_data calls so that the prefetch thread does not accidentally make + // simultaneous cudaMalloc calls when the main thread is running. In some + // GPUs this seems to cause failures if we do not so. + this->prefetch_data_.mutable_cpu_data(); + if (this->output_labels_) { + this->prefetch_label_.mutable_cpu_data(); + } + DLOG(INFO) << "Initializing prefetch"; + this->CreatePrefetchThread(); + DLOG(INFO) << "Prefetch initialized."; } -template +template void BasePrefetchingDataLayer::CreatePrefetchThread() { - this->data_transformer_->InitRand(); - CHECK(StartInternalThread()) << "Thread execution failed"; + this->data_transformer_->InitRand(); + CHECK(StartInternalThread()) << "Thread execution failed"; } -template +template void BasePrefetchingDataLayer::JoinPrefetchThread() { - CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; + CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; } -template +template void BasePrefetchingDataLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - - DLOG(INFO) << "Thread joined"; - // Reshape to loaded data. 
- top[0]->ReshapeLike(prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_cpu_data()); - DLOG(INFO) << "Prefetch copied"; - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_cpu_data()); - } - // Start a new prefetch thread - DLOG(INFO) << "CreatePrefetchThread"; - CreatePrefetchThread(); + const vector*>& bottom, const vector*>& top) { + // First, join the thread + JoinPrefetchThread(); + + DLOG(INFO) << "Thread joined"; + // Reshape to loaded data. + top[0]->ReshapeLike(prefetch_data_); + // Copy the data + caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), + top[0]->mutable_cpu_data()); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. + top[1]->ReshapeLike(prefetch_label_); + // Copy the labels. + caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), + top[1]->mutable_cpu_data()); + } + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); } -template -void BasePrefetchingDataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - - JoinPrefetchThread(); - DLOG(INFO) << "Thread joined"; - - top[0]->ReshapeLike(this->prefetch_data_); - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) ); - DLOG(INFO) << "Prefetch copied"; - if (this->output_labels_) { - // Reshape to loaded labels. 
- top[1]->ReshapeLike(prefetch_label_); - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) ); - } - +template +void BasePrefetchingDataLayer::Forward_gpu( + const vector*>& bottom, + const vector*>& top) { + + JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; + + top[0]->ReshapeLike(this->prefetch_data_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, + NULL, NULL)); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. + top[1]->ReshapeLike(prefetch_label_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), 0, + NULL, NULL)); + } + #ifdef Track_data_transfer #endif - - // Start a new prefetch thread - DLOG(INFO) << "CreatePrefetchThread"; - CreatePrefetchThread(); + + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); } #ifdef CPU_ONLY STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif -INSTANTIATE_CLASS(BaseDataLayer); -INSTANTIATE_CLASS(BasePrefetchingDataLayer); +INSTANTIATE_CLASS (BaseDataLayer); +INSTANTIATE_CLASS (BasePrefetchingDataLayer); } // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 3fe6f42e..8f72f41b 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -8,65 +8,66 @@ namespace caffe { const float kBNLL_THRESHOLD = 50.; -template +template void BNLLLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i 
= 0; i < count; ++i) { - top_data[i] = bottom_data[i] > 0 ? - bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = + bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. + exp(bottom_data[i])); + } } -template +template void BNLLLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype expval; - for (int i = 0; i < count; ++i) { - expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); - bottom_diff[i] = top_diff[i] * expval / (expval + 1.); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype expval; + for (int i = 0; i < count; ++i) { + expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); + bottom_diff[i] = top_diff[i] * expval / (expval + 1.); + } + } } -template +template void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // 
NOLINT_NEXT_LINE(whitespace/operators) + BNLLForward(count, bottom_data, top_data); } -template +template void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward(count, top_diff, bottom_data, bottom_diff); - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLBackward(count, top_diff, bottom_data, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU(BNLLLayer); #endif -INSTANTIATE_CLASS(BNLLLayer); -REGISTER_LAYER_CLASS(BNLL); +INSTANTIATE_CLASS (BNLLLayer); +REGISTER_LAYER_CLASS (BNLL); } // namespace caffe diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 6bc8f9e9..b885d9e6 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -6,133 +6,141 @@ namespace caffe { -template +template void ConcatLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const ConcatParameter& concat_param = this->layer_param_.concat_param(); - CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) - << "Either axis or concat_dim should be specified; not both."; + const vector*>& top) { + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) + << "Either axis or concat_dim should be specified; not both."; } -template +template void ConcatLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int 
num_axes = bottom[0]->num_axes(); - const ConcatParameter& concat_param = this->layer_param_.concat_param(); - if (concat_param.has_concat_dim()) { - concat_axis_ = static_cast(concat_param.concat_dim()); - // Don't allow negative indexing for concat_dim, a uint32 -- almost - // certainly unintended. - CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " - << "produced negative result; concat_dim must satisfy " - << "0 <= concat_dim < " << kMaxBlobAxes; - CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; - } else { - concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); - } - // Initialize with the first blob. - vector top_shape = bottom[0]->shape(); - num_concats_ = bottom[0]->count(0, concat_axis_); - concat_input_size_ = bottom[0]->count(concat_axis_ + 1); - int bottom_count_sum = bottom[0]->count(); - for (int i = 1; i < bottom.size(); ++i) { - CHECK_EQ(num_axes, bottom[i]->num_axes()) - << "All inputs must have the same #axes."; - for (int j = 0; j < num_axes; ++j) { - if (j == concat_axis_) { continue; } - CHECK_EQ(top_shape[j], bottom[i]->shape(j)) - << "All inputs must have the same shape, except at concat_axis."; - } - bottom_count_sum += bottom[i]->count(); - top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); - } - top[0]->Reshape(top_shape); - CHECK_EQ(bottom_count_sum, top[0]->count()); + const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + if (concat_param.has_concat_dim()) { + concat_axis_ = static_cast(concat_param.concat_dim()); + // Don't allow negative indexing for concat_dim, a uint32 -- almost + // certainly unintended. 
+ CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " + << "produced negative result; concat_dim must satisfy " + << "0 <= concat_dim < " << kMaxBlobAxes; + CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; + } else { + concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); + } + // Initialize with the first blob. + vector top_shape = bottom[0]->shape(); + num_concats_ = bottom[0]->count(0, concat_axis_); + concat_input_size_ = bottom[0]->count(concat_axis_ + 1); + int bottom_count_sum = bottom[0]->count(); + for (int i = 1; i < bottom.size(); ++i) { + CHECK_EQ(num_axes, bottom[i]->num_axes()) + << "All inputs must have the same #axes."; + for (int j = 0; j < num_axes; ++j) { + if (j == concat_axis_) { + continue; + } + CHECK_EQ(top_shape[j], bottom[i]->shape(j)) + << "All inputs must have the same shape, except at concat_axis."; + } + bottom_count_sum += bottom[i]->count(); + top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); + } + top[0]->Reshape(top_shape); + CHECK_EQ(bottom_count_sum, top[0]->count()); } -template +template void ConcatLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_cpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, - bottom_data + n * bottom_concat_axis * concat_input_size_, - top_data + (n * top_concat_axis + offset_concat_axis) - * concat_input_size_); - } - offset_concat_axis += bottom_concat_axis; - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_cpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data 
= bottom[i]->cpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, + bottom_data + n * bottom_concat_axis * concat_input_size_, + top_data + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); + } + offset_concat_axis += bottom_concat_axis; + } } -template +template void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + - (n * top_concat_axis + offset_concat_axis) * concat_input_size_, - bottom_diff + n * bottom_concat_axis * concat_input_size_); - } - offset_concat_axis += bottom_concat_axis; - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + if (!propagate_down[i]) { + continue; + } + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + bottom_diff + n * bottom_concat_axis * concat_input_size_); + } + offset_concat_axis += bottom_concat_axis; + } } -template +template void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (bottom.size() == 1) { return; } - Dtype* top_data = 
top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } + const vector*>& top) { + if (bottom.size() == 1) { + return; + } + Dtype* top_data = top[0]->mutable_gpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = true; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + offset_concat_axis += bottom_concat_axis; + } } -template +template void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (bottom.size() == 1) { return; } - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - if (propagate_down[i]) { - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * 
num_concats_; - Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - } - offset_concat_axis += bottom_concat_axis; - } + const vector& propagate_down, const vector*>& bottom) { + if (bottom.size() == 1) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = false; + for (int i = 0; i < bottom.size(); ++i) { + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + if (propagate_down[i]) { + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + } + offset_concat_axis += bottom_concat_axis; + } } #ifdef CPU_ONLY STUB_GPU(ConcatLayer); #endif -INSTANTIATE_CLASS(ConcatLayer); -REGISTER_LAYER_CLASS(Concat); +INSTANTIATE_CLASS (ConcatLayer); +REGISTER_LAYER_CLASS (Concat); } // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 4b47eb42..9c3f38d5 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -8,180 +8,180 @@ namespace caffe { -template +template void ContrastiveLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); - CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); - CHECK_EQ(bottom[0]->height(), 1); - CHECK_EQ(bottom[0]->width(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - CHECK_EQ(bottom[2]->channels(), 1); - CHECK_EQ(bottom[2]->height(), 1); - CHECK_EQ(bottom[2]->width(), 1); - diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); - 
diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); - dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1); - // vector of ones used to sum along channels - summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); - for (int i = 0; i < bottom[0]->channels(); ++i) - summer_vec_.mutable_cpu_data()[i] = Dtype(1); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); + CHECK_EQ(bottom[0]->height(), 1); + CHECK_EQ(bottom[0]->width(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); + CHECK_EQ(bottom[2]->channels(), 1); + CHECK_EQ(bottom[2]->height(), 1); + CHECK_EQ(bottom[2]->width(), 1); + diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1); + // vector of ones used to sum along channels + summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); + for (int i = 0; i < bottom[0]->channels(); ++i) + summer_vec_.mutable_cpu_data()[i] = Dtype(1); } -template +template void ContrastiveLossLayer::Forward_cpu( - const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), // a - bottom[1]->cpu_data(), // b - diff_.mutable_cpu_data()); // a_i-b_i - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, - diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels)); - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - 
Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& bottom, + const vector*>& top) { + int count = bottom[0]->count(); + caffe_sub( + count, + bottom[0]->cpu_data(), // a + bottom[1]->cpu_data(), // b + diff_.mutable_cpu_data()); // a_i-b_i + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, + diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } -template +template void ContrastiveLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[i]->num()); - int num = bottom[i]->num(); - int channels = bottom[i]->channels(); - for (int j = 0; j < num; ++j) { - Dtype* bout = bottom[i]->mutable_cpu_diff(); - if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs - caffe_cpu_axpby( - channels, - alpha, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = margin - dist_sq_.cpu_data()[j]; - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq_.cpu_data()[j]); - mdist = margin - dist; - beta = -alpha * mdist / (dist + Dtype(1e-4)); - } - if (mdist > Dtype(0.0)) { - caffe_cpu_axpby( - channels, - beta, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); - } else { - caffe_set(channels, Dtype(0), bout + (j*channels)); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(bottom[i]->num()); + int num = bottom[i]->num(); + int channels = bottom[i]->channels(); + for (int j = 0; j < num; ++j) { + Dtype* bout = bottom[i]->mutable_cpu_diff(); + if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs + caffe_cpu_axpby( + channels, + alpha, + diff_.cpu_data() + (j * channels), + Dtype(0.0), + bout + (j * channels)); + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = margin - dist_sq_.cpu_data()[j]; + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq_.cpu_data()[j]); + mdist = margin - dist; + beta = -alpha * mdist / (dist + Dtype(1e-4)); + } + if (mdist > Dtype(0.0)) { + caffe_cpu_axpby( + channels, + beta, + diff_.cpu_data() + (j * channels), + Dtype(0.0), + bout + (j * channels)); + } else { + caffe_set(channels, Dtype(0), bout + (j * channels)); + } + } + } + } + } } -template +template void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], 
Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& bottom, const vector*>& top) { + const int count = bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx( + count, + diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), + diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv( + CblasNoTrans, + bottom[0]->num(), + bottom[0]->channels(), + Dtype(1.0), + diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), + Dtype(0.0), + dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } -template +template void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward(count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const int count = bottom[0]->count(); + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + const Dtype sign = (i == 0) ? 1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(bottom[0]->num()); + // NOLINT_NEXT_LINE(whitespace/operators) + CLLBackward(count, channels, margin, legacy_version, alpha, + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); + } + } } #ifdef CPU_ONLY STUB_GPU(ContrastiveLossLayer); #endif -INSTANTIATE_CLASS(ContrastiveLossLayer); -REGISTER_LAYER_CLASS(ContrastiveLoss); +INSTANTIATE_CLASS (ContrastiveLossLayer); +REGISTER_LAYER_CLASS (ContrastiveLoss); } // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index c829dbd7..d5ffdb9f 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -7,232 +7,236 @@ namespace caffe { -template +template void ConvolutionLayer::compute_output_shape() { - this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) - / this->stride_h_ + 1; - this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) - / this->stride_w_ + 1; + 
this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) + / this->stride_h_ + 1; + this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) + / this->stride_w_ + 1; } -template +template void ConvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + top[i]->offset(n), bias); - } - } - } - - // CHECK_BLOB_DATA(top[0],20, "top[0]"); + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* top_data = top[i]->mutable_cpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); + } + } + } + + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template +template void ConvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - // Bias gradient, if necessary. 
- if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), + top_diff + top[i]->offset(n), weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); + } + } + } + } } -template +template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (use_packing_scheme && global_packing_N >1) - Forward_gpu_opt2(bottom, top); - else - Forward_gpu_org(bottom, top); + const vector*>& top) { + if (use_packing_scheme && global_packing_N > 1) + Forward_gpu_opt2(bottom, top); + else + Forward_gpu_org(bottom, top); } -template +template void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (use_packing_scheme && global_packing_N >1) - Backward_gpu_opt2(top, propagate_down, bottom); - else - Backward_gpu_org(top, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + if (use_packing_scheme && global_packing_N > 1) + Backward_gpu_opt2(top, propagate_down, bottom); + else + Backward_gpu_org(top, propagate_down, bottom); } -template -void ConvolutionLayer::Forward_gpu_opt2(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); - - Dtype* top_data = top[i]->mutable_gpu_data(); - this->opt_num2 = global_packing_N; - this->weight_offset_ = this->M_ * this->K_; - for (int n = 0; n < this->num_; n += this->opt_num2) { - this->opt_num2 = this->opt_num2 > (this->num_ - n)? 
(this->num_ - n) : this->opt_num2; - //intermediate variables to pass offset - this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; - this->top_offset_ = top[i]->offset(n); - this->col_offset_ = this->K_ * this->N_ * this->opt_num2; - this->bottom_offset_ = bottom[i]->offset(n); - this->forward_gpu_gemm_opt(bottom_data, weight, - top_data); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias_opt(top_data, bias); - } - } - } - - //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - //CHECK_BLOB_DATA(top[0],20, "top[0]"); +template +void ConvolutionLayer::Forward_gpu_opt2( + const vector*>& bottom, + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? 
(this->num_ - n) : this->opt_num2; + //intermediate variables to pass offset + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_ * this->opt_num2; + this->bottom_offset_ = bottom[i]->offset(n); + this->forward_gpu_gemm_opt(bottom_data, weight, + top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias_opt(top_data, bias); + } + } + } + + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template -void ConvolutionLayer::Forward_gpu_org(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); - - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - //two intermediate variables to pass offset - this->bottom_offset_ = bottom[i]->offset(n); - this->top_offset_ = top[i]->offset(n); - this->forward_gpu_gemm(bottom_data, weight, - top_data); - - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data, bias); - } - } - } - - // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - //CHECK_BLOB_DATA(top[0],20, "top[0]"); +template +void ConvolutionLayer::Forward_gpu_org( + const vector*>& bottom, + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + //two intermediate variables to pass offset + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->forward_gpu_gemm(bottom_data, weight, + top_data); + + if (this->bias_term_) { + 
const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } + + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template +template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - this->weight_offset_ = this->M_ * this->K_; - this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; n += this->opt_num2) { - this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2; - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm_opt(bottom_data, - top_diff, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->backward_gpu_gemm_opt(top_diff, weight, - bottom_diff); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + this->weight_offset_ = this->M_ * this->K_; + this->opt_num2 = global_packing_N; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm_opt(bottom_data, + top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_gpu_gemm_opt(top_diff, weight, + bottom_diff); + } + } + } + } } -template +template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data, - top_diff, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff, weight, - bottom_diff); - } - } - } - } - + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(bottom_data, + top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm(top_diff, weight, + bottom_diff); + } + } + } + } + } #ifdef CPU_ONLY STUB_GPU(ConvolutionLayer); #endif -INSTANTIATE_CLASS(ConvolutionLayer); +INSTANTIATE_CLASS (ConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 8ac9b8ee..bff8b10c 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -16,113 +16,113 @@ namespace caffe { -template +template DataLayer::~DataLayer() { - this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } -template +template void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - // Initialize DB - db_.reset(db::GetDB(this->layer_param_.data_param().backend())); - db_->Open(this->layer_param_.data_param().source(), db::READ); - cursor_.reset(db_->NewCursor()); + const vector*>& top) { + // Initialize DB + db_.reset(db::GetDB(this->layer_param_.data_param().backend())); + db_->Open(this->layer_param_.data_param().source(), db::READ); + cursor_.reset(db_->NewCursor()); - // Check if we should randomly skip a few data points - if 
(this->layer_param_.data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.data_param().rand_skip(); - LOG(INFO) << "Skipping first " << skip << " data points."; - while (skip-- > 0) { - cursor_->Next(); - } - } - // Read a data point, to initialize the prefetch and top blobs. - Datum datum; - datum.ParseFromString(cursor_->value()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape top[0] and prefetch_data according to the batch_size. - top_shape[0] = this->layer_param_.data_param().batch_size(); - this->prefetch_data_.Reshape(top_shape); - top[0]->ReshapeLike(this->prefetch_data_); - this->prefetch_data_.set_data_layer(); + // Check if we should randomly skip a few data points + if (this->layer_param_.data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() % + this->layer_param_.data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + while (skip-- > 0) { + cursor_->Next(); + } + } + // Read a data point, to initialize the prefetch and top blobs. + Datum datum; + datum.ParseFromString(cursor_->value()); + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape top[0] and prefetch_data according to the batch_size. 
+ top_shape[0] = this->layer_param_.data_param().batch_size(); + this->prefetch_data_.Reshape(top_shape); + top[0]->ReshapeLike(this->prefetch_data_); + this->prefetch_data_.set_data_layer(); - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - if (this->output_labels_) { - vector label_shape(1, this->layer_param_.data_param().batch_size()); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); - this->prefetch_label_.set_data_layer(); - } + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + if (this->output_labels_) { + vector label_shape(1, this->layer_param_.data_param().batch_size()); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); + this->prefetch_label_.set_data_layer(); + } } // This function is used to create a thread that prefetches the data. -template +template void DataLayer::InternalThreadEntry() { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(this->prefetch_data_.count()); - CHECK(this->transformed_data_.count()); + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(this->prefetch_data_.count()); + CHECK(this->transformed_data_.count()); - // Reshape according to the first datum of each batch - // on single input batches allows for inputs of varying dimension. - const int batch_size = this->layer_param_.data_param().batch_size(); - Datum datum; - datum.ParseFromString(cursor_->value()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data according to the batch_size. 
- top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); + // Reshape according to the first datum of each batch + // on single input batches allows for inputs of varying dimension. + const int batch_size = this->layer_param_.data_param().batch_size(); + Datum datum; + datum.ParseFromString(cursor_->value()); + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. + top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); - Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* top_label = NULL; // suppress warnings about uninitialized variables + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* top_label = NULL; // suppress warnings about uninitialized variables - if (this->output_labels_) { - top_label = this->prefetch_label_.mutable_cpu_data(); - } - timer.Start(); - for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a datum - Datum datum; - datum.ParseFromString(cursor_->value()); - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply data transformations (mirror, scale, crop...) - int offset = this->prefetch_data_.offset(item_id); - this->transformed_data_.set_cpu_data(top_data + offset); - this->data_transformer_->Transform(datum, &(this->transformed_data_)); - // Copy label. - if (this->output_labels_) { - top_label[item_id] = datum.label(); - } - trans_time += timer.MicroSeconds(); - timer.Start(); - // go to the next item. 
- cursor_->Next(); - if (!cursor_->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor_->SeekToFirst(); - } - } - timer.Stop(); - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + if (this->output_labels_) { + top_label = this->prefetch_label_.mutable_cpu_data(); + } + timer.Start(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a datum + Datum datum; + datum.ParseFromString(cursor_->value()); + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply data transformations (mirror, scale, crop...) + int offset = this->prefetch_data_.offset(item_id); + this->transformed_data_.set_cpu_data(top_data + offset); + this->data_transformer_->Transform(datum, &(this->transformed_data_)); + // Copy label. + if (this->output_labels_) { + top_label[item_id] = datum.label(); + } + trans_time += timer.MicroSeconds(); + timer.Start(); + // go to the next item. 
+ cursor_->Next(); + if (!cursor_->valid()) { + DLOG(INFO) << "Restarting data prefetching from start."; + cursor_->SeekToFirst(); + } + } + timer.Stop(); + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(DataLayer); -REGISTER_LAYER_CLASS(Data); +INSTANTIATE_CLASS (DataLayer); +REGISTER_LAYER_CLASS (Data); } // namespace caffe diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 4b952c73..aa61a755 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -8,129 +8,128 @@ namespace caffe { -template +template void DeconvolutionLayer::compute_output_shape() { - this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ - - 2 * this->pad_h_; - this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ - - 2 * this->pad_w_; + this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ + - 2 * this->pad_h_; + this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ + - 2 * this->pad_w_; } -template +template void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + top[i]->offset(n), bias); - } - } - } + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); 
+ Dtype* top_data = top[i]->mutable_cpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); + } + } + } } -template +template void DeconvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // Gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // Gradient w.r.t. bottom data, if necessary, reusing the column buffer - // we might have just computed above. 
- if (propagate_down[i]) { - this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n), - this->param_propagate_down_[0]); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + // Gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_cpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); + } + // Gradient w.r.t. bottom data, if necessary, reusing the column buffer + // we might have just computed above. 
+ if (propagate_down[i]) { + this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n), + this->param_propagate_down_[0]); + } + } + } + } } -template +template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->bottom_offset_ = bottom[i]->offset(n); - this->top_offset_ = top[i]->offset(n); - this->backward_gpu_gemm(bottom_data, weight, top_data); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data, bias); - } - } - } + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_gemm(bottom_data, weight, top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } } -template +template void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. 
- if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. 
+ if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); + } + } + } + } } - #ifdef CPU_ONLY STUB_GPU(DeconvolutionLayer); #endif -INSTANTIATE_CLASS(DeconvolutionLayer); -REGISTER_LAYER_CLASS(Deconvolution); +INSTANTIATE_CLASS (DeconvolutionLayer); +REGISTER_LAYER_CLASS (Deconvolution); } // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 4175a2b7..ae045c5c 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -10,122 +10,122 @@ namespace caffe { -template -void DropoutLayer::ocl_setup(int bottom_count){ - MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL); +template +void DropoutLayer::ocl_setup(int bottom_count) { + MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + bottom_count * sizeof(int), NULL, NULL); } -template -DropoutLayer::~DropoutLayer(){ - OCL_CHECK( clReleaseMemObject(MaskMem) ); -} - - -template +template +DropoutLayer::~DropoutLayer() { + OCL_CHECK (clReleaseMemObject(MaskMem) ); + } +template void DropoutLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.dropout_param().dropout_ratio(); - DCHECK(threshold_ > 0.); - DCHECK(threshold_ < 1.); - scale_ = 1. / (1. - threshold_); - uint_thres_ = static_cast(UINT_MAX * threshold_); - ocl_setup(bottom[0]->count()); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. 
- threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); + ocl_setup(bottom[0]->count()); } -template +template void DropoutLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::Reshape(bottom, top); - // Set up the cache for random number generation - rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + const vector*>& top) { + NeuronLayer < Dtype > ::Reshape(bottom, top); + // Set up the cache for random number generation + rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); } -template +template void DropoutLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - unsigned int* mask = rand_vec_.mutable_cpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - // Create random numbers - caffe_rng_bernoulli(count, 1. - threshold_, mask); - for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] * mask[i] * scale_; - } - } else { - caffe_copy(bottom[0]->count(), bottom_data, top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + unsigned int* mask = rand_vec_.mutable_cpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + // Create random numbers + caffe_rng_bernoulli(count, 1. 
- threshold_, mask); + for (int i = 0; i < count; ++i) { + top_data[i] = bottom_data[i] * mask[i] * scale_; + } + } else { + caffe_copy(bottom[0]->count(), bottom_data, top_data); + } } -template +template void DropoutLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = rand_vec_.cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * mask[i] * scale_; - } - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = rand_vec_.cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * mask[i] * scale_; + } + } else { + caffe_copy(top[0]->count(), top_diff, bottom_diff); + } + } } -template +template void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - //unsigned int* mask = - // static_cast(rand_vec_.mutable_gpu_data()); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + //unsigned int* mask = + // static_cast(rand_vec_.mutable_gpu_data()); #ifdef use_cpu_generator_dropout - unsigned int* mask_cpu = - static_cast(rand_vec_.mutable_cpu_data()); - caffe_rng_bernoulli(count, 1. 
- threshold_, mask_cpu); - OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); - DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + unsigned int* mask_cpu = + static_cast(rand_vec_.mutable_cpu_data()); + caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu); + OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); + DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else - caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_); - DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1., + threshold_); + DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_, + top_data); #endif - } else { - caffe_gpu_copy(count, bottom_data, top_data); - } + } else { + caffe_gpu_copy(count, bottom_data, top_data); + } } - -template +template void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const int count = bottom[0]->count(); - DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff); - } else { - caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const int count = bottom[0]->count(); + DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_, + (Dtype) scale_, bottom_diff); + } else { + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + } } - #ifdef CPU_ONLY 
STUB_GPU(DropoutLayer); #endif -INSTANTIATE_CLASS(DropoutLayer); -REGISTER_LAYER_CLASS(Dropout); +INSTANTIATE_CLASS (DropoutLayer); +REGISTER_LAYER_CLASS (Dropout); } // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 6b0d6174..8a3fe17e 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -6,110 +6,110 @@ namespace caffe { -template +template void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int num_top = top.size(); - const DummyDataParameter& param = this->layer_param_.dummy_data_param(); - const int num_data_filler = param.data_filler_size(); - CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) - << "Number of data fillers must be 0, 1 or equal to the number of tops: " - << num_top << "; you specified " << num_data_filler << " data fillers."; + const vector*>& top) { + const int num_top = top.size(); + const DummyDataParameter& param = this->layer_param_.dummy_data_param(); + const int num_data_filler = param.data_filler_size(); + CHECK(num_data_filler == 0 || num_data_filler == 1 || + num_data_filler == num_top) + << "Number of data fillers must be 0, 1 or equal to the number of tops: " + << num_top << "; you specified " << num_data_filler << " data fillers."; - const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); - if (legacy_dims) { - CHECK_EQ(0, param.shape_size()) - << "Both shape and legacy fields were specified"; - // Using deprecated 4D output dim specifiers. 
- CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; - CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; - CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; - CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; - } else { - CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; - } - // refill_[i] tells Forward i whether or not to actually refill top Blob i. - // If refill_[i] is false, Forward does nothing for Blob i. We use this to - // avoid wastefully refilling "constant" Blobs in every forward pass. - // We first fill refill_ in with the INVERSE of its final values. - // The first time we run Forward from the LayerSetUp method, we'll fill only - // Blobs for which refill_ is normally false. These Blobs will never be - // filled again. - refill_.clear(); - fillers_.clear(); - if (num_data_filler <= 1) { - FillerParameter filler_param; - if (num_data_filler == 0) { - filler_param.set_type("constant"); - filler_param.set_value(0); - } else { - filler_param.CopyFrom(param.data_filler(0)); - } - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. 
- refill_.resize(1); - refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); - fillers_.resize(1); - fillers_[0].reset(GetFiller(filler_param)); - } else { - refill_.resize(num_top); - fillers_.resize(num_top); - for (int i = 0; i < num_top; ++i) { - fillers_[i].reset(GetFiller(param.data_filler(i))); - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. - refill_[i] = - (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); - } - } - for (int i = 0; i < num_top; ++i) { - if (legacy_dims) { - const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); - const int channels = - (param.channels_size() == 1) ? param.channels(0) : param.channels(i); - const int height = - (param.height_size() == 1) ? param.height(0) : param.height(i); - const int width = - (param.width_size() == 1) ? param.width(0) : param.width(i); - top[i]->Reshape(num, channels, height, width); - } else { - const int shape_index = (param.shape_size() == 1) ? 0 : i; - top[i]->Reshape(param.shape(shape_index)); - } - } - // Run Forward once, with refill_ inverted, to fill the constant Blobs. - this->Forward(bottom, top); - // Invert the inverted refill_ values to refill the desired (non-constant) - // Blobs in every usual forward pass. - for (int i = 0; i < refill_.size(); ++i) { - refill_[i] = !refill_[i]; - } + const bool legacy_dims = param.num_size() || param.channels_size() || + param.height_size() || param.width_size(); + if (legacy_dims) { + CHECK_EQ(0, param.shape_size()) + << "Both shape and legacy fields were specified"; + // Using deprecated 4D output dim specifiers. 
+ CHECK(param.num_size() == 1 || param.num_size() == num_top) + << "Must specify 'num' once, or once per top blob " + << "(" << num_top << "); specified " << param.num_size() << "."; + CHECK(param.channels_size() == 1 || param.channels_size() == num_top) + << "Must specify 'channels' once, or once per top blob " + << "(" << num_top << "); specified " << param.channels_size() << "."; + CHECK(param.height_size() == 1 || param.height_size() == num_top) + << "Must specify 'height' once, or once per top blob " + << "(" << num_top << "); specified " << param.height_size() << "."; + CHECK(param.width_size() == 1 || param.width_size() == num_top) + << "Must specify 'width' once, or once per top blob " + << "(" << num_top << "); specified " << param.width_size() << "."; + } else { + CHECK(param.shape_size() == 1 || param.shape_size() == num_top) + << "Must specify 'shape' once, or once per top blob " + << "(" << num_top << "); specified " << param.shape_size() << "."; + } + // refill_[i] tells Forward i whether or not to actually refill top Blob i. + // If refill_[i] is false, Forward does nothing for Blob i. We use this to + // avoid wastefully refilling "constant" Blobs in every forward pass. + // We first fill refill_ in with the INVERSE of its final values. + // The first time we run Forward from the LayerSetUp method, we'll fill only + // Blobs for which refill_ is normally false. These Blobs will never be + // filled again. + refill_.clear(); + fillers_.clear(); + if (num_data_filler <= 1) { + FillerParameter filler_param; + if (num_data_filler == 0) { + filler_param.set_type("constant"); + filler_param.set_value(0); + } else { + filler_param.CopyFrom(param.data_filler(0)); + } + // Refill on each iteration iff not using a constant filler, + // but use the inverse of this rule for the first run. 
+ refill_.resize(1); + refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); + fillers_.resize(1); + fillers_[0].reset(GetFiller < Dtype > (filler_param)); + } else { + refill_.resize(num_top); + fillers_.resize(num_top); + for (int i = 0; i < num_top; ++i) { + fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i))); + // Refill on each iteration iff not using a constant filler, + // but use the inverse of this rule for the first run. + refill_[i] = + (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); + } + } + for (int i = 0; i < num_top; ++i) { + if (legacy_dims) { + const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); + const int channels = + (param.channels_size() == 1) ? param.channels(0) : param.channels(i); + const int height = + (param.height_size() == 1) ? param.height(0) : param.height(i); + const int width = + (param.width_size() == 1) ? param.width(0) : param.width(i); + top[i]->Reshape(num, channels, height, width); + } else { + const int shape_index = (param.shape_size() == 1) ? 0 : i; + top[i]->Reshape(param.shape(shape_index)); + } + } + // Run Forward once, with refill_ inverted, to fill the constant Blobs. + this->Forward(bottom, top); + // Invert the inverted refill_ values to refill the desired (non-constant) + // Blobs in every usual forward pass. + for (int i = 0; i < refill_.size(); ++i) { + refill_[i] = !refill_[i]; + } } -template +template void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - const int filler_id = (fillers_.size() > 1) ? i : 0; - if (refill_[filler_id]) { - fillers_[filler_id]->Fill(top[i]); - } - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + const int filler_id = (fillers_.size() > 1) ? 
i : 0; + if (refill_[filler_id]) { + fillers_[filler_id]->Fill(top[i]); + } + } } -INSTANTIATE_CLASS(DummyDataLayer); -REGISTER_LAYER_CLASS(DummyData); +INSTANTIATE_CLASS (DummyDataLayer); +REGISTER_LAYER_CLASS (DummyData); } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 5a7e5e74..45126d44 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -7,239 +7,244 @@ namespace caffe { -template +template void EltwiseLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << - "Eltwise Layer takes one coefficient per bottom blob."; - CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << - "Eltwise layer only takes coefficients for summation."; - op_ = this->layer_param_.eltwise_param().operation(); - // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector(bottom.size(), 1); - if (this->layer_param().eltwise_param().coeff_size()) { - for (int i = 0; i < bottom.size(); ++i) { - coeffs_[i] = this->layer_param().eltwise_param().coeff(i); - } - } - stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad(); + const vector*>& top) { + CHECK(this->layer_param().eltwise_param().coeff_size() == 0 + || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << + "Eltwise Layer takes one coefficient per bottom blob."; + CHECK(!(this->layer_param().eltwise_param().operation() + == EltwiseParameter_EltwiseOp_PROD + && this->layer_param().eltwise_param().coeff_size())) << + "Eltwise layer only takes coefficients for summation."; + op_ = this->layer_param_.eltwise_param().operation(); + // Blob-wise coefficients for the elementwise operation. 
+ coeffs_ = vector < Dtype > (bottom.size(), 1); + if (this->layer_param().eltwise_param().coeff_size()) { + for (int i = 0; i < bottom.size(); ++i) { + coeffs_[i] = this->layer_param().eltwise_param().coeff(i); + } + } + stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad(); } -template +template void EltwiseLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); - } - top[0]->ReshapeLike(*bottom[0]); - // If max operation, we will initialize the vector index part. - if (this->layer_param_.eltwise_param().operation() == - EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { - max_idx_.Reshape(bottom[0]->shape()); - } + const vector*>& top) { + for (int i = 1; i < bottom.size(); ++i) { + CHECK(bottom[i]->shape() == bottom[0]->shape()); + } + top[0]->ReshapeLike(*bottom[0]); + // If max operation, we will initialize the vector index part. + if (this->layer_param_.eltwise_param().operation() == + EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { + max_idx_.Reshape(bottom[0]->shape()); + } } -template +template void EltwiseLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - int* mask = NULL; - const Dtype* bottom_data_a = NULL; - const Dtype* bottom_data_b = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_cpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Dtype(0), top_data); - // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - // Initialize - mask = max_idx_.mutable_cpu_data(); - caffe_set(count, -1, mask); - caffe_set(count, Dtype(-FLT_MAX), top_data); - // bottom 0 & 1 - bottom_data_a = bottom[0]->cpu_data(); - bottom_data_b = bottom[1]->cpu_data(); - for (int idx = 0; idx < count; ++idx) { - if (bottom_data_a[idx] > bottom_data_b[idx]) { - top_data[idx] = bottom_data_a[idx]; // maxval - mask[idx] = 0; // maxid - } else { - top_data[idx] = bottom_data_b[idx]; // maxval - mask[idx] = 1; // maxid - } - } - // bottom 2++ - for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) { - bottom_data_b = bottom[blob_idx]->cpu_data(); - for (int idx = 0; idx < count; ++idx) { - if (bottom_data_b[idx] > top_data[idx]) { - top_data[idx] = bottom_data_b[idx]; // maxval - mask[idx] = blob_idx; // maxid - } - } - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } + const vector*>& bottom, const vector*>& top) { + int* mask = NULL; + const Dtype* bottom_data_a = NULL; + const Dtype* bottom_data_b = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_cpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_set(count, Dtype(0), top_data); + // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + // Initialize + mask = max_idx_.mutable_cpu_data(); + caffe_set(count, -1, mask); + caffe_set(count, Dtype(-FLT_MAX), top_data); + // bottom 0 & 1 + bottom_data_a = bottom[0]->cpu_data(); + bottom_data_b = bottom[1]->cpu_data(); + for (int idx = 0; idx < count; ++idx) { + if (bottom_data_a[idx] > bottom_data_b[idx]) { + top_data[idx] = bottom_data_a[idx]; // maxval + mask[idx] = 0; // maxid + } else { + top_data[idx] = bottom_data_b[idx]; // maxval + mask[idx] = 1; // maxid + } + } + // bottom 2++ + for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) { + bottom_data_b = bottom[blob_idx]->cpu_data(); + for (int idx = 0; idx < count; ++idx) { + if (bottom_data_b[idx] > top_data[idx]) { + top_data[idx] = bottom_data_b[idx]; // maxval + mask[idx] = blob_idx; // maxid + } + } + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } } -template +template void EltwiseLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); - initialized = true; - } else { - caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_div(count, top_data, bottom_data, bottom_diff); - } - caffe_mul(count, bottom_diff, top_diff, bottom_diff); 
- break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.cpu_data(); - for (int index = 0; index < count; ++index) { - Dtype gradient = 0; - if (mask[index] == i) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); + initialized = true; + } else { + caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_div(count, top_data, bottom_data, bottom_diff); + } + caffe_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1)) { + caffe_copy(count, top_diff, bottom_diff); + } else { + caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.cpu_data(); + for (int index = 0; index < count; ++index) { + Dtype gradient = 0; + if (mask[index] == i) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } } -template +template void 
EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward(count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } + const vector*>& top) { + int* mask = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.mutable_gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, + top_data, mask); + for (int i = 2; i < bottom.size(); ++i) { + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, + mask); + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } } -template +template void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_gpu_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward(count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } + const vector& 
propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); + initialized = true; + } else { + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + } + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1.)) { + caffe_gpu_copy(count, top_diff, bottom_diff); + } else { + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.gpu_data(); + MaxBackward(count, top_diff, i, mask, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } } - #ifdef CPU_ONLY STUB_GPU(EltwiseLayer); #endif -INSTANTIATE_CLASS(EltwiseLayer); -REGISTER_LAYER_CLASS(Eltwise); +INSTANTIATE_CLASS (EltwiseLayer); +REGISTER_LAYER_CLASS (Eltwise); } // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index d1efe5bb..d5abc23f 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -7,83 +7,83 @@ namespace caffe { -template +template void EuclideanLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) - << "Inputs must have 
the same dimension."; - diff_.ReshapeLike(*bottom[0]); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) + << "Inputs must have the same dimension."; + diff_.ReshapeLike(*bottom[0]); } -template +template void EuclideanLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); - Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& top) { + int count = bottom[0]->count(); + caffe_sub( + count, + bottom[0]->cpu_data(), + bottom[1]->cpu_data(), + diff_.mutable_cpu_data()); + Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } -template +template void EuclideanLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_cpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.cpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_cpu_diff()); // b - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_cpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.cpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // b + } + } } -template +template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), + bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + Dtype dot; + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } -template +template void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b + } + } } #ifdef CPU_ONLY STUB_GPU(EuclideanLossLayer); #endif -INSTANTIATE_CLASS(EuclideanLossLayer); -REGISTER_LAYER_CLASS(EuclideanLoss); +INSTANTIATE_CLASS (EuclideanLossLayer); +REGISTER_LAYER_CLASS (EuclideanLoss); } // namespace caffe diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 5e7819c0..8451b133 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -7,94 +7,98 @@ namespace caffe { -template +template void ExpLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - const Dtype base = this->layer_param_.exp_param().base(); - if (base != Dtype(-1)) { - CHECK_GT(base, 0) << "base must be strictly positive."; - } - // If base == -1, interpret the base as e and set log_base = 1 exactly. - // Otherwise, calculate its log explicitly. - const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; - const Dtype input_scale = this->layer_param_.exp_param().scale(); - const Dtype input_shift = this->layer_param_.exp_param().shift(); - inner_scale_ = log_base * input_scale; - outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + const Dtype base = this->layer_param_.exp_param().base(); + if (base != Dtype(-1)) { + CHECK_GT(base, 0) << "base must be strictly positive."; + } + // If base == -1, interpret the base as e and set log_base = 1 exactly. + // Otherwise, calculate its log explicitly. + const Dtype log_base = (base == Dtype(-1)) ? 
Dtype(1) : log(base); + CHECK(!isnan(log_base)) + << "NaN result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isinf(log_base)) + << "Inf result: log(base) = log(" << base << ") = " << log_base; + const Dtype input_scale = this->layer_param_.exp_param().scale(); + const Dtype input_shift = this->layer_param_.exp_param().shift(); + inner_scale_ = log_base * input_scale; + outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift); } -template +template void ExpLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_exp(count, bottom_data, top_data); - } else { - caffe_cpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_scal(count, outer_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_exp(count, bottom_data, top_data); + } else { + caffe_cpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_scal(count, outer_scale_, top_data); + } } -template +template void ExpLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_scal(count, inner_scale_, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) 
{ + return; + } + const int count = bottom[0]->count(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_scal(count, inner_scale_, bottom_diff); + } } -template +template void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); + } else { + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); + } } -template +template void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* top_data = 
top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU(ExpLayer); #endif -INSTANTIATE_CLASS(ExpLayer); -REGISTER_LAYER_CLASS(Exp); +INSTANTIATE_CLASS (ExpLayer); +REGISTER_LAYER_CLASS (Exp); } // namespace caffe diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index c5f5e4dd..9fa26c80 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -7,180 +7,180 @@ namespace caffe { -template +template void FilterLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(top.size(), bottom.size() - 1); - first_reshape_ = true; + const vector*>& top) { + CHECK_EQ(top.size(), bottom.size() - 1); + first_reshape_ = true; } -template +template void FilterLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - // bottom[0...k-1] are the blobs to filter - // bottom[last] is the "selector_blob" - int selector_index = bottom.size() - 1; - for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) { - CHECK_EQ(bottom[selector_index]->shape(i), 1) - << "Selector blob dimensions must be singletons (1), except the first"; - } - for (int i = 0; i < bottom.size() - 1; ++i) { - CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) << - "Each bottom should have the same 0th dimension as the selector blob"; - } + const vector*>& top) { + // bottom[0...k-1] are the blobs to filter + // bottom[last] is the "selector_blob" + int selector_index = bottom.size() - 1; + for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) { + CHECK_EQ(bottom[selector_index]->shape(i), 1) + << "Selector blob dimensions must be singletons (1), except the first"; + } + for (int i = 0; i < bottom.size() - 1; ++i) { + CHECK_EQ(bottom[selector_index]->shape(0), 
bottom[i]->shape(0)) << + "Each bottom should have the same 0th dimension as the selector blob"; + } - const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); - indices_to_forward_.clear(); + const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); + indices_to_forward_.clear(); - // look for non-zero elements in bottom[0]. Items of each bottom that - // have the same index as the items in bottom[0] with value == non-zero - // will be forwarded - for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) { - // we don't need an offset because item size == 1 - const Dtype* tmp_data_selector = bottom_data_selector + item_id; - if (*tmp_data_selector) { - indices_to_forward_.push_back(item_id); - } - } - // only filtered items will be forwarded - int new_tops_num = indices_to_forward_.size(); - // init - if (first_reshape_) { - new_tops_num = bottom[0]->shape(0); - first_reshape_ = false; - } - for (int t = 0; t < top.size(); ++t) { - int num_axes = bottom[t]->num_axes(); - vector shape_top(num_axes); - shape_top[0] = new_tops_num; - for (int ts = 1; ts < num_axes; ++ts) - shape_top[ts] = bottom[t]->shape(ts); - top[t]->Reshape(shape_top); - } + // look for non-zero elements in bottom[0]. 
Items of each bottom that + // have the same index as the items in bottom[0] with value == non-zero + // will be forwarded + for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) { + // we don't need an offset because item size == 1 + const Dtype* tmp_data_selector = bottom_data_selector + item_id; + if (*tmp_data_selector) { + indices_to_forward_.push_back(item_id); + } + } + // only filtered items will be forwarded + int new_tops_num = indices_to_forward_.size(); + // init + if (first_reshape_) { + new_tops_num = bottom[0]->shape(0); + first_reshape_ = false; + } + for (int t = 0; t < top.size(); ++t) { + int num_axes = bottom[t]->num_axes(); + vector shape_top(num_axes); + shape_top[0] = new_tops_num; + for (int ts = 1; ts < num_axes; ++ts) + shape_top[ts] = bottom[t]->shape(ts); + top[t]->Reshape(shape_top); + } } -template +template void FilterLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->cpu_data(); - Dtype* top_data = top[t]->mutable_cpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1); - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->cpu_data(); + Dtype* top_data = top[t]->mutable_cpu_data(); + int dim = bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * 
bottom[t]->count(1); + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } } -template +template void FilterLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); i++) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); n++) { - data_offset_bottom = n * dim; - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - if (n != batch_offset) { // this data was not been forwarded - caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - next_to_backward_offset++; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top, - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; i < top.size(); i++) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if 
(propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); n++) { + data_offset_bottom = n * dim; + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + caffe_set(dim, Dtype(0), + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + if (n != batch_offset) { // this data was not been forwarded + caffe_set(dim, Dtype(0), + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + next_to_backward_offset++; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top, + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } + } + } + } + } } -template +template void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->gpu_data(); + Dtype* top_data = top[t]->mutable_gpu_data(); + int dim = 
bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * dim; + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } } -template +template void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for 
(int i = 0; i < top.size(); ++i) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); ++n) { + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not been forwarded + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } + } + } + } + } } #ifdef CPU_ONLY STUB_GPU(FilterLayer); #endif -INSTANTIATE_CLASS(FilterLayer); -REGISTER_LAYER_CLASS(Filter); +INSTANTIATE_CLASS (FilterLayer); +REGISTER_LAYER_CLASS (Filter); } // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index f7e5c9c2..4aaad3a4 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -6,39 +6,39 @@ namespace caffe { -template +template void FlattenLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int start_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().axis()); - const int end_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().end_axis()); - 
vector top_shape; - for (int i = 0; i < start_axis; ++i) { - top_shape.push_back(bottom[0]->shape(i)); - } - const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1); - top_shape.push_back(flattened_dim); - for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) { - top_shape.push_back(bottom[0]->shape(i)); - } - top[0]->Reshape(top_shape); - CHECK_EQ(top[0]->count(), bottom[0]->count()); + const vector*>& top) { + const int start_axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.flatten_param().axis()); + const int end_axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.flatten_param().end_axis()); + vector top_shape; + for (int i = 0; i < start_axis; ++i) { + top_shape.push_back(bottom[0]->shape(i)); + } + const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1); + top_shape.push_back(flattened_dim); + for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) { + top_shape.push_back(bottom[0]->shape(i)); + } + top[0]->Reshape(top_shape); + CHECK_EQ(top[0]->count(), bottom[0]->count()); } -template +template void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - top[0]->ShareData(*bottom[0]); + const vector*>& top) { + top[0]->ShareData(*bottom[0]); } -template +template void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - bottom[0]->ShareDiff(*top[0]); + const vector& propagate_down, const vector*>& bottom) { + bottom[0]->ShareDiff(*top[0]); } -INSTANTIATE_CLASS(FlattenLayer); -REGISTER_LAYER_CLASS(Flatten); +INSTANTIATE_CLASS (FlattenLayer); +REGISTER_LAYER_CLASS (Flatten); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index af223c0f..377755b9 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -1,11 +1,11 @@ /* -TODO: -- load file in a separate thread ("prefetch") -- can be smarter about the memcpy call instead of doing it 
row-by-row - :: use util functions caffe_copy, and Blob->offset() - :: don't forget to update hdf5_daa_layer.cu accordingly -- add ability to shuffle filenames if flag is set -*/ + TODO: + - load file in a separate thread ("prefetch") + - can be smarter about the memcpy call instead of doing it row-by-row + :: use util functions caffe_copy, and Blob->offset() + :: don't forget to update hdf5_daa_layer.cu accordingly + - add ability to shuffle filenames if flag is set + */ #include // NOLINT(readability/streams) #include #include @@ -20,182 +20,187 @@ namespace caffe { -template -HDF5DataLayer::~HDF5DataLayer() { } +template +HDF5DataLayer::~HDF5DataLayer() { +} // Load data and label from HDF5 filename into the class property blobs. -template +template void HDF5DataLayer::LoadHDF5FileData(const char* filename) { - DLOG(INFO) << "Loading HDF5 file: " << filename; - hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT); - if (file_id < 0) { - LOG(FATAL) << "Failed opening HDF5 file: " << filename; - } - - int top_size = this->layer_param_.top_size(); - hdf_blobs_.resize(top_size); - - const int MIN_DATA_DIM = 1; - const int MAX_DATA_DIM = INT_MAX; - - for (int i = 0; i < top_size; ++i) { - hdf_blobs_[i] = shared_ptr >(new Blob()); - hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), - MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); - } - - herr_t status = H5Fclose(file_id); - CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename; - - // MinTopBlobs==1 guarantees at least one top blob - CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis."; - const int num = hdf_blobs_[0]->shape(0); - for (int i = 1; i < top_size; ++i) { - CHECK_EQ(hdf_blobs_[i]->shape(0), num); - } - // Default to identity permutation. - data_permutation_.clear(); - data_permutation_.resize(hdf_blobs_[0]->shape(0)); - for (int i = 0; i < hdf_blobs_[0]->shape(0); i++) - data_permutation_[i] = i; - - // Shuffle if needed. 
- if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) - << " rows (shuffled)"; - } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; - } + DLOG(INFO) << "Loading HDF5 file: " << filename; + hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + LOG(FATAL) << "Failed opening HDF5 file: " << filename; + } + + int top_size = this->layer_param_.top_size(); + hdf_blobs_.resize(top_size); + + const int MIN_DATA_DIM = 1; + const int MAX_DATA_DIM = INT_MAX; + + for (int i = 0; i < top_size; ++i) { + hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); + hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), + MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); + } + + herr_t status = H5Fclose(file_id); + CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename; + + // MinTopBlobs==1 guarantees at least one top blob + CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis."; + const int num = hdf_blobs_[0]->shape(0); + for (int i = 1; i < top_size; ++i) { + CHECK_EQ(hdf_blobs_[i]->shape(0), num); + } + // Default to identity permutation. + data_permutation_.clear(); + data_permutation_.resize(hdf_blobs_[0]->shape(0)); + for (int i = 0; i < hdf_blobs_[0]->shape(0); i++) + data_permutation_[i] = i; + + // Shuffle if needed. + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + << " rows (shuffled)"; + } else { + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; + } } -template +template void HDF5DataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - // Refuse transformation parameters since HDF5 is totally generic. 
- CHECK(!this->layer_param_.has_transform_param()) << - this->type() << " does not transform data."; - // Read the source to parse the filenames. - const string& source = this->layer_param_.hdf5_data_param().source(); - LOG(INFO) << "Loading list of HDF5 filenames from: " << source; - hdf_filenames_.clear(); - std::ifstream source_file(source.c_str()); - if (source_file.is_open()) { - std::string line; - while (source_file >> line) { - hdf_filenames_.push_back(line); - } - } else { - LOG(FATAL) << "Failed to open source file: " << source; - } - source_file.close(); - num_files_ = hdf_filenames_.size(); - current_file_ = 0; - LOG(INFO) << "Number of HDF5 files: " << num_files_; - CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " - << source; - - file_permutation_.clear(); - file_permutation_.resize(num_files_); - // Default to identity permutation. - for (int i = 0; i < num_files_; i++) { - file_permutation_[i] = i; - } - - // Shuffle if needed. - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); - } - - // Load the first HDF5 file and initialize the line counter. - LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str()); - current_row_ = 0; - - // Reshape blobs. - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - const int top_size = this->layer_param_.top_size(); - vector top_shape; - for (int i = 0; i < top_size; ++i) { - top_shape.resize(hdf_blobs_[i]->num_axes()); - top_shape[0] = batch_size; - for (int j = 1; j < top_shape.size(); ++j) { - top_shape[j] = hdf_blobs_[i]->shape(j); - } - top[i]->Reshape(top_shape); - } + const vector*>& top) { + // Refuse transformation parameters since HDF5 is totally generic. + CHECK(!this->layer_param_.has_transform_param()) << + this->type() << " does not transform data."; + // Read the source to parse the filenames. 
+ const string& source = this->layer_param_.hdf5_data_param().source(); + LOG(INFO) << "Loading list of HDF5 filenames from: " << source; + hdf_filenames_.clear(); + std::ifstream source_file(source.c_str()); + if (source_file.is_open()) { + std::string line; + while (source_file >> line) { + hdf_filenames_.push_back(line); + } + } else { + LOG(FATAL) << "Failed to open source file: " << source; + } + source_file.close(); + num_files_ = hdf_filenames_.size(); + current_file_ = 0; + LOG(INFO) << "Number of HDF5 files: " << num_files_; + CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " + << source; + + file_permutation_.clear(); + file_permutation_.resize(num_files_); + // Default to identity permutation. + for (int i = 0; i < num_files_; i++) { + file_permutation_[i] = i; + } + + // Shuffle if needed. + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); + } + + // Load the first HDF5 file and initialize the line counter. + LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str()); + current_row_ = 0; + + // Reshape blobs. 
+ const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + const int top_size = this->layer_param_.top_size(); + vector top_shape; + for (int i = 0; i < top_size; ++i) { + top_shape.resize(hdf_blobs_[i]->num_axes()); + top_shape[0] = batch_size; + for (int j = 1; j < top_shape.size(); ++j) { + top_shape[j] = hdf_blobs_[i]->shape(j); + } + top[i]->Reshape(top_shape); + } } -template +template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - ++current_file_; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); - } - } + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + ++current_file_; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + 
hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + caffe_copy(data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + } + } } -template +template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[j]->mutable_gpu_data(), CL_TRUE, i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], 0, NULL, NULL) ); - //caffe_copy(data_dim, - // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); - } - } + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == 
hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + current_file_ += 1; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, + i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], + 0, NULL, NULL)); + //caffe_copy(data_dim, + // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + } + } } - #ifdef CPU_ONLY STUB_GPU_FORWARD(HDF5DataLayer, Forward); #endif -INSTANTIATE_CLASS(HDF5DataLayer); -REGISTER_LAYER_CLASS(HDF5Data); +INSTANTIATE_CLASS (HDF5DataLayer); +REGISTER_LAYER_CLASS (HDF5Data); } // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index e2bd8e4c..cbb8a6fe 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -11,92 +11,100 @@ namespace caffe { -template +template void HDF5OutputLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - file_name_ = this->layer_param_.hdf5_output_param().file_name(); - file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - H5P_DEFAULT); - CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; - file_opened_ = true; + const vector*>& top) { + file_name_ = 
this->layer_param_.hdf5_output_param().file_name(); + file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, + H5P_DEFAULT); + CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; + file_opened_ = true; } -template +template HDF5OutputLayer::~HDF5OutputLayer() { - if (file_opened_) { - herr_t status = H5Fclose(file_id_); - CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_; - } + if (file_opened_) { + herr_t status = H5Fclose(file_id_); + CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_; + } } -template +template void HDF5OutputLayer::SaveBlobs() { - // TODO: no limit on the number of blobs - LOG(INFO) << "Saving HDF5 file " << file_name_; - CHECK_EQ(data_blob_.num(), label_blob_.num()) << - "data blob and label blob must have the same batch size"; - hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); - hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); - LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; + // TODO: no limit on the number of blobs + LOG(INFO) << "Saving HDF5 file " << file_name_; + CHECK_EQ(data_blob_.num(), label_blob_.num()) << + "data blob and label blob must have the same batch size"; + hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); + hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); + LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; } -template +template void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + const vector*>& top) { + 
CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); + for (int i = 0; i < bottom[0]->num(); ++i) { + caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], + &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], + &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + } + SaveBlobs(); } -template +template void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; + const vector& propagate_down, const vector*>& bottom) { + return; } -template +template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + const vector*>& top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + 
bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - for (int i = 0; i < bottom[0]->num(); ++i) { - OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[0]->gpu_data(), CL_TRUE, i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); - OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[1]->gpu_data(), CL_TRUE, i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL)); - } - SaveBlobs(); + for (int i = 0; i < bottom[0]->num(); ++i) { + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[0]->gpu_data(), CL_TRUE, + i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, + &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[1]->gpu_data(), CL_TRUE, + i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, + &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL)); + } + SaveBlobs(); } -template +template void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; + const vector& propagate_down, const vector*>& bottom) { + return; } #ifdef CPU_ONLY STUB_GPU(HDF5OutputLayer); #endif -INSTANTIATE_CLASS(HDF5OutputLayer); -REGISTER_LAYER_CLASS(HDF5Output); +INSTANTIATE_CLASS (HDF5OutputLayer); +REGISTER_LAYER_CLASS (HDF5Output); } // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index a2fb2a18..e01e1d6a 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ 
b/src/caffe/layers/hinge_loss_layer.cpp @@ -10,73 +10,73 @@ namespace caffe { -template +template void HingeLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int count = bottom[0]->count(); - int dim = count / num; + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int count = bottom[0]->count(); + int dim = count / num; - caffe_copy(count, bottom_data, bottom_diff); - for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; - } - for (int i = 0; i < num; ++i) { - for (int j = 0; j < dim; ++j) { - bottom_diff[i * dim + j] = std::max( - Dtype(0), 1 + bottom_diff[i * dim + j]); - } - } - Dtype* loss = top[0]->mutable_cpu_data(); - switch (this->layer_param_.hinge_loss_param().norm()) { - case HingeLossParameter_Norm_L1: - loss[0] = caffe_cpu_asum(count, bottom_diff) / num; - break; - case HingeLossParameter_Norm_L2: - loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; - break; - default: - LOG(FATAL) << "Unknown Norm"; - } + caffe_copy(count, bottom_data, bottom_diff); + for (int i = 0; i < num; ++i) { + bottom_diff[i * dim + static_cast(label[i])] *= -1; + } + for (int i = 0; i < num; ++i) { + for (int j = 0; j < dim; ++j) { + bottom_diff[i * dim + j] = std::max( + Dtype(0), 1 + bottom_diff[i * dim + j]); + } + } + Dtype* loss = top[0]->mutable_cpu_data(); + switch (this->layer_param_.hinge_loss_param().norm()) { + case HingeLossParameter_Norm_L1: + loss[0] = caffe_cpu_asum(count, bottom_diff) / num; + break; + case HingeLossParameter_Norm_L2: + loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; + break; + default: + LOG(FATAL) << "Unknown Norm"; + } } 
-template +template void HingeLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int count = bottom[0]->count(); - int dim = count / num; + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int count = bottom[0]->count(); + int dim = count / num; - for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; - } + for (int i = 0; i < num; ++i) { + bottom_diff[i * dim + static_cast(label[i])] *= -1; + } - const Dtype loss_weight = top[0]->cpu_diff()[0]; - switch (this->layer_param_.hinge_loss_param().norm()) { - case HingeLossParameter_Norm_L1: - caffe_cpu_sign(count, bottom_diff, bottom_diff); - caffe_scal(count, loss_weight / num, bottom_diff); - break; - case HingeLossParameter_Norm_L2: - caffe_scal(count, loss_weight * 2 / num, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown Norm"; - } - } + const Dtype loss_weight = top[0]->cpu_diff()[0]; + switch (this->layer_param_.hinge_loss_param().norm()) { + case HingeLossParameter_Norm_L1: + caffe_cpu_sign(count, bottom_diff, bottom_diff); + caffe_scal(count, loss_weight / num, bottom_diff); + break; + case HingeLossParameter_Norm_L2: + caffe_scal(count, loss_weight * 2 / num, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown Norm"; + } + } } -INSTANTIATE_CLASS(HingeLossLayer); -REGISTER_LAYER_CLASS(HingeLoss); +INSTANTIATE_CLASS (HingeLossLayer); +REGISTER_LAYER_CLASS (HingeLoss); } // 
namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 7b667172..b29e47e2 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -7,115 +7,113 @@ namespace caffe { -template +template void Im2colLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - if (conv_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = conv_param.kernel_size(); - } else { - kernel_h_ = conv_param.kernel_h(); - kernel_w_ = conv_param.kernel_w(); - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - if (!conv_param.has_pad_h()) { - pad_h_ = pad_w_ = conv_param.pad(); - } else { - pad_h_ = conv_param.pad_h(); - pad_w_ = conv_param.pad_w(); - } - if (!conv_param.has_stride_h()) { - stride_h_ = stride_w_ = conv_param.stride(); - } else { - stride_h_ = conv_param.stride_h(); - stride_w_ = conv_param.stride_w(); - } + const vector*>& top) { + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + CHECK(!conv_param.has_kernel_size() != + !(conv_param.has_kernel_h() && 
conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK(conv_param.has_kernel_size() || + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK((!conv_param.has_pad() && conv_param.has_pad_h() + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK((!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); + } else { + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); + } else { + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); + } + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = conv_param.stride(); + } else { + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); + } } -template +template void Im2colLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - top[0]->Reshape( - bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, - (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, - (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = 
bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + top[0]->Reshape( + bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, + (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } -template +template void Im2colLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data + top[0]->offset(n)); + } } -template +template void Im2colLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); + } } -template +template void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data 
= bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data, top[0]->offset(n)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data, top[0]->offset(n)); + } } -template +template void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); + } } - - #ifdef CPU_ONLY STUB_GPU(Im2colLayer); #endif -INSTANTIATE_CLASS(Im2colLayer); -REGISTER_LAYER_CLASS(Im2col); +INSTANTIATE_CLASS (Im2colLayer); +REGISTER_LAYER_CLASS (Im2col); } // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 18c035cb..846bcc34 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -15,145 +15,145 @@ namespace caffe { -template +template ImageDataLayer::~ImageDataLayer() { - 
this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } -template +template void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int new_height = this->layer_param_.image_data_param().new_height(); - const int new_width = this->layer_param_.image_data_param().new_width(); - const bool is_color = this->layer_param_.image_data_param().is_color(); - string root_folder = this->layer_param_.image_data_param().root_folder(); - - CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; - // Read the file with filenames and labels - const string& source = this->layer_param_.image_data_param().source(); - LOG(INFO) << "Opening file " << source; - std::ifstream infile(source.c_str()); - string filename; - int label; - while (infile >> filename >> label) { - lines_.push_back(std::make_pair(filename, label)); - } - - if (this->layer_param_.image_data_param().shuffle()) { - // randomly shuffle data - LOG(INFO) << "Shuffling data"; - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - ShuffleImages(); - } - LOG(INFO) << "A total of " << lines_.size() << " images."; - - lines_id_ = 0; - // Check if we would need to randomly skip a few data points - if (this->layer_param_.image_data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); - LOG(INFO) << "Skipping first " << skip << " data points."; - CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; - lines_id_ = skip; - } - // Read an image, and use it to initialize the top blob. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - // Use data_transformer to infer the expected blob shape from a cv_image. 
- vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data and top[0] according to the batch_size. - const int batch_size = this->layer_param_.image_data_param().batch_size(); - top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); - top[0]->ReshapeLike(this->prefetch_data_); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); + const vector*>& top) { + const int new_height = this->layer_param_.image_data_param().new_height(); + const int new_width = this->layer_param_.image_data_param().new_width(); + const bool is_color = this->layer_param_.image_data_param().is_color(); + string root_folder = this->layer_param_.image_data_param().root_folder(); + + CHECK((new_height == 0 && new_width == 0) || + (new_height > 0 && new_width > 0)) << "Current implementation requires " + "new_height and new_width to be set at the same time."; + // Read the file with filenames and labels + const string& source = this->layer_param_.image_data_param().source(); + LOG(INFO) << "Opening file " << source; + std::ifstream infile(source.c_str()); + string filename; + int label; + while (infile >> filename >> label) { + lines_.push_back(std::make_pair(filename, label)); + } + + if (this->layer_param_.image_data_param().shuffle()) { + // randomly shuffle data + LOG(INFO) << "Shuffling data"; + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + ShuffleImages(); + } + LOG(INFO) << "A total of " << lines_.size() << " images."; + + lines_id_ = 0; + // Check if we would need to randomly skip a few data points + if (this->layer_param_.image_data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() % + 
this->layer_param_.image_data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; + lines_id_ = skip; + } + // Read an image, and use it to initialize the top blob. + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + // Use data_transformer to infer the expected blob shape from a cv_image. + vector top_shape = this->data_transformer_->InferBlobShape(cv_img); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data and top[0] according to the batch_size. + const int batch_size = this->layer_param_.image_data_param().batch_size(); + top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); + top[0]->ReshapeLike(this->prefetch_data_); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); } -template +template void ImageDataLayer::ShuffleImages() { - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - shuffle(lines_.begin(), lines_.end(), prefetch_rng); + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + shuffle(lines_.begin(), lines_.end(), prefetch_rng); } // This function is used to create a thread that prefetches the data. 
-template +template void ImageDataLayer::InternalThreadEntry() { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(this->prefetch_data_.count()); - CHECK(this->transformed_data_.count()); - ImageDataParameter image_data_param = this->layer_param_.image_data_param(); - const int batch_size = image_data_param.batch_size(); - const int new_height = image_data_param.new_height(); - const int new_width = image_data_param.new_width(); - const bool is_color = image_data_param.is_color(); - string root_folder = image_data_param.root_folder(); - - // Reshape according to the first image of each batch - // on single input batches allows for inputs of varying dimension. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - // Use data_transformer to infer the expected blob shape from a cv_img. - vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data according to the batch_size. - top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); - - Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data(); - - // datum scales - const int lines_size = lines_.size(); - for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a blob - timer.Start(); - CHECK_GT(lines_size, lines_id_); - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply transformations (mirror, crop...) 
to the image - int offset = this->prefetch_data_.offset(item_id); - this->transformed_data_.set_cpu_data(prefetch_data + offset); - this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); - trans_time += timer.MicroSeconds(); - - prefetch_label[item_id] = lines_[lines_id_].second; - // go to the next iter - lines_id_++; - if (lines_id_ >= lines_size) { - // We have reached the end. Restart from the first. - DLOG(INFO) << "Restarting data prefetching from start."; - lines_id_ = 0; - if (this->layer_param_.image_data_param().shuffle()) { - ShuffleImages(); - } - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(this->prefetch_data_.count()); + CHECK(this->transformed_data_.count()); + ImageDataParameter image_data_param = this->layer_param_.image_data_param(); + const int batch_size = image_data_param.batch_size(); + const int new_height = image_data_param.new_height(); + const int new_width = image_data_param.new_width(); + const bool is_color = image_data_param.is_color(); + string root_folder = image_data_param.root_folder(); + + // Reshape according to the first image of each batch + // on single input batches allows for inputs of varying dimension. + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + // Use data_transformer to infer the expected blob shape from a cv_img. + vector top_shape = this->data_transformer_->InferBlobShape(cv_img); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. 
+ top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); + + Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data(); + + // datum scales + const int lines_size = lines_.size(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a blob + timer.Start(); + CHECK_GT(lines_size, lines_id_); + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply transformations (mirror, crop...) to the image + int offset = this->prefetch_data_.offset(item_id); + this->transformed_data_.set_cpu_data(prefetch_data + offset); + this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); + trans_time += timer.MicroSeconds(); + + prefetch_label[item_id] = lines_[lines_id_].second; + // go to the next iter + lines_id_++; + if (lines_id_ >= lines_size) { + // We have reached the end. Restart from the first. 
+ DLOG(INFO) << "Restarting data prefetching from start."; + lines_id_ = 0; + if (this->layer_param_.image_data_param().shuffle()) { + ShuffleImages(); + } + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(ImageDataLayer); -REGISTER_LAYER_CLASS(ImageData); +INSTANTIATE_CLASS (ImageDataLayer); +REGISTER_LAYER_CLASS (ImageData); } // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a1e0b40d..e5294a7e 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -10,101 +10,100 @@ namespace caffe { -template +template void InfogainLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); - if (bottom.size() < 3) { - CHECK(this->layer_param_.infogain_loss_param().has_source()) - << "Infogain matrix source must be specified."; - BlobProto blob_proto; - ReadProtoFromBinaryFile( - this->layer_param_.infogain_loss_param().source(), &blob_proto); - infogain_.FromProto(blob_proto); - } + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + if (bottom.size() < 3) { + CHECK(this->layer_param_.infogain_loss_param().has_source()) + << "Infogain matrix source must be specified."; + BlobProto blob_proto; + ReadProtoFromBinaryFile( + this->layer_param_.infogain_loss_param().source(), &blob_proto); + infogain_.FromProto(blob_proto); + } } -template +template void InfogainLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - Blob* infogain = NULL; - if (bottom.size() < 3) { - infogain = &infogain_; - } else { - infogain = bottom[2]; - } - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - 
CHECK_EQ(bottom[1]->width(), 1); - const int num = bottom[0]->num(); - const int dim = bottom[0]->count() / num; - CHECK_EQ(infogain->num(), 1); - CHECK_EQ(infogain->channels(), 1); - CHECK_EQ(infogain->height(), dim); - CHECK_EQ(infogain->width(), dim); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + Blob < Dtype > *infogain = NULL; + if (bottom.size() < 3) { + infogain = &infogain_; + } else { + infogain = bottom[2]; + } + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); + const int num = bottom[0]->num(); + const int dim = bottom[0]->count() / num; + CHECK_EQ(infogain->num(), 1); + CHECK_EQ(infogain->channels(), 1); + CHECK_EQ(infogain->height(), dim); + CHECK_EQ(infogain->width(), dim); } - -template +template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = NULL; - if (bottom.size() < 3) { - infogain_mat = infogain_.cpu_data(); - } else { - infogain_mat = bottom[2]->cpu_data(); - } - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - loss -= infogain_mat[label * dim + j] * log(prob); - } - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* infogain_mat = NULL; + if (bottom.size() < 3) { + infogain_mat = infogain_.cpu_data(); + } else { + infogain_mat = bottom[2]->cpu_data(); + } + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int 
label = static_cast(bottom_label[i]); + for (int j = 0; j < dim; ++j) { + Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); + loss -= infogain_mat[label * dim + j] * log(prob); + } + } + top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void InfogainLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down.size() > 2 && propagate_down[2]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to infogain inputs."; - } - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = NULL; - if (bottom.size() < 3) { - infogain_mat = infogain_.cpu_data(); - } else { - infogain_mat = bottom[2]->cpu_data(); - } - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = - top[0]->cpu_diff()[0] / num; - for (int i = 0; i < num; ++i) { - const int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; - } - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down.size() > 2 && propagate_down[2]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to infogain inputs."; + } + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* infogain_mat = NULL; + if (bottom.size() < 3) { + infogain_mat = infogain_.cpu_data(); + } else { + infogain_mat = 
bottom[2]->cpu_data(); + } + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + const Dtype scale = -top[0]->cpu_diff()[0] / num; + for (int i = 0; i < num; ++i) { + const int label = static_cast(bottom_label[i]); + for (int j = 0; j < dim; ++j) { + Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); + bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; + } + } + } } -INSTANTIATE_CLASS(InfogainLossLayer); -REGISTER_LAYER_CLASS(InfogainLoss); +INSTANTIATE_CLASS (InfogainLossLayer); +REGISTER_LAYER_CLASS (InfogainLoss); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 676650c2..e563aa21 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -9,164 +9,168 @@ namespace caffe { -template +template void InnerProductLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int num_output = this->layer_param_.inner_product_param().num_output(); - bias_term_ = this->layer_param_.inner_product_param().bias_term(); - N_ = num_output; - const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); - // Dimensions starting from "axis" are "flattened" into a single - // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), - // and axis == 1, N inner products with dimension CHW are performed. 
- K_ = bottom[0]->count(axis); - // Check if we need to set up the weights - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - if (bias_term_) { - this->blobs_.resize(2); - } else { - this->blobs_.resize(1); - } - // Intialize the weight - vector weight_shape(2); - weight_shape[0] = N_; - weight_shape[1] = K_; - this->blobs_[0].reset(new Blob(weight_shape)); - // fill the weights - shared_ptr > weight_filler(GetFiller( - this->layer_param_.inner_product_param().weight_filler())); - weight_filler->Fill(this->blobs_[0].get()); - // If necessary, intiialize and fill the bias term - if (bias_term_) { - vector bias_shape(1, N_); - this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.inner_product_param().bias_filler())); - bias_filler->Fill(this->blobs_[1].get()); - } - } // parameter initialization - this->param_propagate_down_.resize(this->blobs_.size(), true); + const vector*>& top) { + const int num_output = this->layer_param_.inner_product_param().num_output(); + bias_term_ = this->layer_param_.inner_product_param().bias_term(); + N_ = num_output; + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + // Dimensions starting from "axis" are "flattened" into a single + // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), + // and axis == 1, N inner products with dimension CHW are performed. 
+ K_ = bottom[0]->count(axis); + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Intialize the weight + vector weight_shape(2); + weight_shape[0] = N_; + weight_shape[1] = K_; + this->blobs_[0].reset(new Blob(weight_shape)); + // fill the weights + shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( + this->layer_param_.inner_product_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, intiialize and fill the bias term + if (bias_term_) { + vector bias_shape(1, N_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( + this->layer_param_.inner_product_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } // parameter initialization + this->param_propagate_down_.resize(this->blobs_.size(), true); } -template +template void InnerProductLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - // Figure out the dimensions - const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); - const int new_K = bottom[0]->count(axis); - CHECK_EQ(K_, new_K) - << "Input size incompatible with inner product parameters."; - // The first "axis" dimensions are independent inner products; the total - // number of these is M_, the product over these dimensions. - M_ = bottom[0]->count(0, axis); - // The top shape will be the bottom shape with the flattened axes dropped, - // and replaced by a single axis with dimension num_output (N_). 
- vector top_shape = bottom[0]->shape(); - top_shape.resize(axis + 1); - top_shape[axis] = N_; - top[0]->Reshape(top_shape); - // Set up the bias multiplier - if (bias_term_) { - vector bias_shape(1, M_); - bias_multiplier_.Reshape(bias_shape); - caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); - } + const vector*>& top) { + // Figure out the dimensions + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + const int new_K = bottom[0]->count(axis); + CHECK_EQ(K_, new_K) + << "Input size incompatible with inner product parameters."; + // The first "axis" dimensions are independent inner products; the total + // number of these is M_, the product over these dimensions. + M_ = bottom[0]->count(0, axis); + // The top shape will be the bottom shape with the flattened axes dropped, + // and replaced by a single axis with dimension num_output (N_). + vector top_shape = bottom[0]->shape(); + top_shape.resize(axis + 1); + top_shape[axis] = N_; + top[0]->Reshape(top_shape); + // Set up the bias multiplier + if (bias_term_) { + vector bias_shape(1, M_); + bias_multiplier_.Reshape(bias_shape); + caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); + } } -template +template void InnerProductLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.cpu_data(), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + caffe_cpu_gemm < Dtype > (CblasNoTrans, 
CblasTrans, M_, N_, K_, (Dtype) 1., + bottom_data, weight, (Dtype) 0., top_data); + if (bias_term_) { + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., + bias_multiplier_.cpu_data(), + this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); + } } -template +template void InnerProductLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->cpu_diff(); - // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.cpu_data(), (Dtype)1., - this->blobs_[1]->mutable_cpu_diff()); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - bottom[0]->mutable_cpu_diff()); - } + const vector& propagate_down, + const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Gradient with respect to weight + caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., + top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->cpu_diff(); + // Gradient with respect to bias + caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff, + bias_multiplier_.cpu_data(), (Dtype) 1., + this->blobs_[1]->mutable_cpu_diff()); + } + if (propagate_down[0]) { + const 
Dtype* top_diff = top[0]->cpu_diff(); + // Gradient with respect to bottom data + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., + top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., + bottom[0]->mutable_cpu_diff()); + } } -template +template void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1., - bottom_data, 0, weight, 0, (Dtype)0., top_data, 0); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(),0, - this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., + bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); + if (bias_term_) { + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., + bias_multiplier_.gpu_data(), 0, + this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); + } } -template +template void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff, - 
(size_t)0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), - (size_t)0, (Dtype)0., 1, - this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0., - bottom[0]->mutable_gpu_diff(), 0); - } + const vector& propagate_down, + const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., + top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bias + caffe_gpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, + (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), + (size_t) 0, (Dtype) 0., 1, + this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); + } + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bottom data + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., + top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., + bottom[0]->mutable_gpu_diff(), 0); + } } #ifdef CPU_ONLY STUB_GPU(InnerProductLayer); #endif -INSTANTIATE_CLASS(InnerProductLayer); -REGISTER_LAYER_CLASS(InnerProduct); +INSTANTIATE_CLASS (InnerProductLayer); +REGISTER_LAYER_CLASS (InnerProduct); } // namespace caffe diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 268c5f5b..e388dfef 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -7,128 +7,130 @@ namespace caffe { -template +template void LogLayer::LayerSetUp(const 
vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - const Dtype base = this->layer_param_.log_param().base(); - if (base != Dtype(-1)) { - CHECK_GT(base, 0) << "base must be strictly positive."; - } - // If base == -1, interpret the base as e and set log_base = 1 exactly. - // Otherwise, calculate its log explicitly. - const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; - base_scale_ = Dtype(1) / log_base; - CHECK(!isnan(base_scale_)) - << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - CHECK(!isinf(base_scale_)) - << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - input_scale_ = this->layer_param_.log_param().scale(); - input_shift_ = this->layer_param_.log_param().shift(); - backward_num_scale_ = input_scale_ / log_base; + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + const Dtype base = this->layer_param_.log_param().base(); + if (base != Dtype(-1)) { + CHECK_GT(base, 0) << "base must be strictly positive."; + } + // If base == -1, interpret the base as e and set log_base = 1 exactly. + // Otherwise, calculate its log explicitly. + const Dtype log_base = (base == Dtype(-1)) ? 
Dtype(1) : log(base); + CHECK(!isnan(log_base)) + << "NaN result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isinf(log_base)) + << "Inf result: log(base) = log(" << base << ") = " << log_base; + base_scale_ = Dtype(1) / log_base; + CHECK(!isnan(base_scale_)) + << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + CHECK(!isinf(base_scale_)) + << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + input_scale_ = this->layer_param_.log_param().scale(); + input_shift_ = this->layer_param_.log_param().shift(); + backward_num_scale_ = input_scale_ / log_base; } -template +template void LogLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_add_scalar(count, input_shift_, top_data); - } - caffe_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_scal(count, base_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_log(count, bottom_data, top_data); + } else { + caffe_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_add_scalar(count, input_shift_, top_data); + } + caffe_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_scal(count, base_scale_, top_data); + } } -template +template void LogLayer::Backward_cpu(const vector*>& 
top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_add_scalar(count, input_shift_, bottom_diff); - } - caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_scal(count, backward_num_scale_, bottom_diff); - } - caffe_mul(count, top_diff, bottom_diff, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + caffe_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_add_scalar(count, input_shift_, bottom_diff); + } + caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_scal(count, backward_num_scale_, bottom_diff); + } + caffe_mul(count, top_diff, bottom_diff, bottom_diff); } -template +template void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_gpu_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - 
caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_gpu_log(count, bottom_data, top_data); + } else { + caffe_gpu_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); + } + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); + } } -template +template void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, bottom_data, bottom_diff); 
+ if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } - - #ifdef CPU_ONLY STUB_GPU(LogLayer); #endif -INSTANTIATE_CLASS(LogLayer); -REGISTER_LAYER_CLASS(Log); +INSTANTIATE_CLASS (LogLayer); +REGISTER_LAYER_CLASS (Log); } // namespace caffe diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 3496a5c2..503014f5 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -10,24 +10,24 @@ namespace caffe { -template +template void LossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - // LossLayers have a non-zero (1) loss by default. - if (this->layer_param_.loss_weight_size() == 0) { - this->layer_param_.add_loss_weight(Dtype(1)); - } + const vector*>& bottom, const vector*>& top) { + // LossLayers have a non-zero (1) loss by default. + if (this->layer_param_.loss_weight_size() == 0) { + this->layer_param_.add_loss_weight(Dtype(1)); + } } -template +template void LossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; - vector loss_shape(0); // Loss layers output a scalar; 0 axes. - top[0]->Reshape(loss_shape); + const vector*>& bottom, const vector*>& top) { + CHECK_EQ(bottom[0]->num(), bottom[1]->num()) + << "The data and label should have the same number."; + vector loss_shape(0); // Loss layers output a scalar; 0 axes. 
+ top[0]->Reshape(loss_shape); } -INSTANTIATE_CLASS(LossLayer); +INSTANTIATE_CLASS (LossLayer); } // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index d2f1c247..0f936f22 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -8,311 +8,311 @@ namespace caffe { -template +template void LRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - size_ = this->layer_param_.lrn_param().local_size(); - CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; - pre_pad_ = (size_ - 1) / 2; - alpha_ = this->layer_param_.lrn_param().alpha(); - beta_ = this->layer_param_.lrn_param().beta(); - k_ = this->layer_param_.lrn_param().k(); - if (this->layer_param_.lrn_param().norm_region() == - LRNParameter_NormRegion_WITHIN_CHANNEL) { - // Set up split_layer_ to use inputs in the numerator and denominator. - split_top_vec_.clear(); - split_top_vec_.push_back(&product_input_); - split_top_vec_.push_back(&square_input_); - LayerParameter split_param; - split_layer_.reset(new SplitLayer(split_param)); - split_layer_->SetUp(bottom, split_top_vec_); - // Set up square_layer_ to square the inputs. - square_bottom_vec_.clear(); - square_top_vec_.clear(); - square_bottom_vec_.push_back(&square_input_); - square_top_vec_.push_back(&square_output_); - LayerParameter square_param; - square_param.mutable_power_param()->set_power(Dtype(2)); - square_layer_.reset(new PowerLayer(square_param)); - square_layer_->SetUp(square_bottom_vec_, square_top_vec_); - // Set up pool_layer_ to sum over square neighborhoods of the input. 
- pool_top_vec_.clear(); - pool_top_vec_.push_back(&pool_output_); - LayerParameter pool_param; - pool_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - pool_param.mutable_pooling_param()->set_pad(pre_pad_); - pool_param.mutable_pooling_param()->set_kernel_size(size_); - pool_layer_.reset(new PoolingLayer(pool_param)); - pool_layer_->SetUp(square_top_vec_, pool_top_vec_); - // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is - // the sum of a squared neighborhood (the output of pool_layer_). - power_top_vec_.clear(); - power_top_vec_.push_back(&power_output_); - LayerParameter power_param; - power_param.mutable_power_param()->set_power(-beta_); - power_param.mutable_power_param()->set_scale(alpha_); - power_param.mutable_power_param()->set_shift(Dtype(1)); - power_layer_.reset(new PowerLayer(power_param)); - power_layer_->SetUp(pool_top_vec_, power_top_vec_); - // Set up a product_layer_ to compute outputs by multiplying inputs by the - // inverse demoninator computed by the power layer. - product_bottom_vec_.clear(); - product_bottom_vec_.push_back(&product_input_); - product_bottom_vec_.push_back(&power_output_); - LayerParameter product_param; - EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - product_layer_.reset(new EltwiseLayer(product_param)); - product_layer_->SetUp(product_bottom_vec_, top); - } + const vector*>& top) { + size_ = this->layer_param_.lrn_param().local_size(); + CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; + pre_pad_ = (size_ - 1) / 2; + alpha_ = this->layer_param_.lrn_param().alpha(); + beta_ = this->layer_param_.lrn_param().beta(); + k_ = this->layer_param_.lrn_param().k(); + if (this->layer_param_.lrn_param().norm_region() == + LRNParameter_NormRegion_WITHIN_CHANNEL) { + // Set up split_layer_ to use inputs in the numerator and denominator. 
+ split_top_vec_.clear(); + split_top_vec_.push_back(&product_input_); + split_top_vec_.push_back(&square_input_); + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, split_top_vec_); + // Set up square_layer_ to square the inputs. + square_bottom_vec_.clear(); + square_top_vec_.clear(); + square_bottom_vec_.push_back(&square_input_); + square_top_vec_.push_back(&square_output_); + LayerParameter square_param; + square_param.mutable_power_param()->set_power(Dtype(2)); + square_layer_.reset(new PowerLayer(square_param)); + square_layer_->SetUp(square_bottom_vec_, square_top_vec_); + // Set up pool_layer_ to sum over square neighborhoods of the input. + pool_top_vec_.clear(); + pool_top_vec_.push_back(&pool_output_); + LayerParameter pool_param; + pool_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + pool_param.mutable_pooling_param()->set_pad(pre_pad_); + pool_param.mutable_pooling_param()->set_kernel_size(size_); + pool_layer_.reset(new PoolingLayer(pool_param)); + pool_layer_->SetUp(square_top_vec_, pool_top_vec_); + // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is + // the sum of a squared neighborhood (the output of pool_layer_). + power_top_vec_.clear(); + power_top_vec_.push_back(&power_output_); + LayerParameter power_param; + power_param.mutable_power_param()->set_power(-beta_); + power_param.mutable_power_param()->set_scale(alpha_); + power_param.mutable_power_param()->set_shift(Dtype(1)); + power_layer_.reset(new PowerLayer(power_param)); + power_layer_->SetUp(pool_top_vec_, power_top_vec_); + // Set up a product_layer_ to compute outputs by multiplying inputs by the + // inverse demoninator computed by the power layer. 
+ product_bottom_vec_.clear(); + product_bottom_vec_.push_back(&product_input_); + product_bottom_vec_.push_back(&power_output_); + LayerParameter product_param; + EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); + product_layer_.reset(new EltwiseLayer(product_param)); + product_layer_->SetUp(product_bottom_vec_, top); + } } -template +template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - top[0]->Reshape(num_, channels_, height_, width_); - scale_.Reshape(num_, channels_, height_, width_); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - split_layer_->Reshape(bottom, split_top_vec_); - square_layer_->Reshape(square_bottom_vec_, square_top_vec_); - pool_layer_->Reshape(square_top_vec_, pool_top_vec_); - power_layer_->Reshape(pool_top_vec_, power_top_vec_); - product_layer_->Reshape(product_bottom_vec_, top); - break; - } - LFSkernel = clCreateKernel(amdDevice.Program,"LRNFillScalefloat",NULL); - LCDkernel = clCreateKernel(amdDevice.Program,"LRNComputeDifffloat",NULL); - LCOkernel = clCreateKernel(amdDevice.Program,"LRNComputeOutputfloat",NULL); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + top[0]->Reshape(num_, channels_, height_, width_); + 
scale_.Reshape(num_, channels_, height_, width_); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + split_layer_->Reshape(bottom, split_top_vec_); + square_layer_->Reshape(square_bottom_vec_, square_top_vec_); + pool_layer_->Reshape(square_top_vec_, pool_top_vec_); + power_layer_->Reshape(pool_top_vec_, power_top_vec_); + product_layer_->Reshape(product_bottom_vec_, top); + break; + } + LFSkernel = clCreateKernel(amdDevice.Program, "LRNFillScalefloat", NULL); + LCDkernel = clCreateKernel(amdDevice.Program, "LRNComputeDifffloat", NULL); + LCOkernel = clCreateKernel(amdDevice.Program, "LRNComputeOutputfloat", NULL); } -template +template void LRNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_cpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_cpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } -template +template void LRNLayer::CrossChannelForward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); - // start with the constant value - for (int i = 0; i < scale_.count(); ++i) { - scale_data[i] = k_; - } - Blob padded_square(1, channels_ + size_ - 1, height_, width_); - Dtype* padded_square_data = padded_square.mutable_cpu_data(); - caffe_set(padded_square.count(), Dtype(0), padded_square_data); - Dtype alpha_over_size = alpha_ / size_; - // go through the 
images - for (int n = 0; n < num_; ++n) { - // compute the padded square - caffe_sqr(channels_ * height_ * width_, - bottom_data + bottom[0]->offset(n), - padded_square_data + padded_square.offset(0, pre_pad_)); - // Create the first channel scale - for (int c = 0; c < size_; ++c) { - caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c), - scale_data + scale_.offset(n, 0)); - } - for (int c = 1; c < channels_; ++c) { - // copy previous scale - caffe_copy(height_ * width_, - scale_data + scale_.offset(n, c - 1), - scale_data + scale_.offset(n, c)); - // add head - caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c + size_ - 1), - scale_data + scale_.offset(n, c)); - // subtract tail - caffe_axpy(height_ * width_, -alpha_over_size, - padded_square_data + padded_square.offset(0, c - 1), - scale_data + scale_.offset(n, c)); - } - } + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + // start with the constant value + for (int i = 0; i < scale_.count(); ++i) { + scale_data[i] = k_; + } + Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_); + Dtype* padded_square_data = padded_square.mutable_cpu_data(); + caffe_set(padded_square.count(), Dtype(0), padded_square_data); + Dtype alpha_over_size = alpha_ / size_; + // go through the images + for (int n = 0; n < num_; ++n) { + // compute the padded square + caffe_sqr(channels_ * height_ * width_, + bottom_data + bottom[0]->offset(n), + padded_square_data + padded_square.offset(0, pre_pad_)); + // Create the first channel scale + for (int c = 0; c < size_; ++c) { + caffe_axpy < Dtype > (height_ * width_, alpha_over_size, + padded_square_data + padded_square.offset(0, c), + scale_data + scale_.offset(n, 0)); + } + for (int c = 1; c < channels_; ++c) { + // copy previous scale + 
caffe_copy < Dtype > (height_ * width_, + scale_data + scale_.offset(n, c - 1), + scale_data + scale_.offset(n, c)); + // add head + caffe_axpy < Dtype > (height_ * width_, alpha_over_size, + padded_square_data + padded_square.offset(0, c + size_ - 1), + scale_data + scale_.offset(n, c)); + // subtract tail + caffe_axpy < Dtype > (height_ * width_, -alpha_over_size, + padded_square_data + padded_square.offset(0, c - 1), + scale_data + scale_.offset(n, c)); + } + } - // In the end, compute output - caffe_powx(scale_.count(), scale_data, -beta_, top_data); - caffe_mul(scale_.count(), top_data, bottom_data, top_data); + // In the end, compute output + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data); + caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); } -template +template void LRNLayer::WithinChannelForward( - const vector*>& bottom, const vector*>& top) { - split_layer_->Forward(bottom, split_top_vec_); - square_layer_->Forward(square_bottom_vec_, square_top_vec_); - pool_layer_->Forward(square_top_vec_, pool_top_vec_); - power_layer_->Forward(pool_top_vec_, power_top_vec_); - product_layer_->Forward(product_bottom_vec_, top); + const vector*>& bottom, const vector*>& top) { + split_layer_->Forward(bottom, split_top_vec_); + square_layer_->Forward(square_bottom_vec_, square_top_vec_); + pool_layer_->Forward(square_top_vec_, pool_top_vec_); + power_layer_->Forward(pool_top_vec_, power_top_vec_); + product_layer_->Forward(product_bottom_vec_, top); } -template +template void LRNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_cpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const 
vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_cpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } -template +template void LRNLayer::CrossChannelBackward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* scale_data = scale_.cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); - Blob accum_ratio(1, 1, height_, width_); - Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); - Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); - // We hack a little bit by using the diff() to store an additional result - Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); - caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); - Dtype cache_ratio_value = 2. 
* alpha_ * beta_ / size_; + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* scale_data = scale_.cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > accum_ratio(1, 1, height_, width_); + Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + // We hack a little bit by using the diff() to store an additional result + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); + Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_; - caffe_powx(scale_.count(), scale_data, -beta_, bottom_diff); - caffe_mul(scale_.count(), top_diff, bottom_diff, bottom_diff); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff); + caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff); - // go through individual data - int inverse_pre_pad = size_ - (size_ + 1) / 2; - for (int n = 0; n < num_; ++n) { - int block_offset = scale_.offset(n); - // first, compute diff_i * y_i / s_i - caffe_mul(channels_ * height_ * width_, - top_diff + block_offset, top_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - caffe_div(channels_ * height_ * width_, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), - scale_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - // Now, compute the accumulated ratios and the bottom diff - caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); - for (int c = 0; c < size_ - 1; ++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); - } - for (int c = 0; c < channels_; 
++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), - accum_ratio_data); - // compute bottom diff - caffe_mul(height_ * width_, - bottom_data + top[0]->offset(n, c), - accum_ratio_data, accum_ratio_times_bottom); - caffe_axpy(height_ * width_, -cache_ratio_value, - accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); - caffe_axpy(height_ * width_, -1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); - } - } + // go through individual data + int inverse_pre_pad = size_ - (size_ + 1) / 2; + for (int n = 0; n < num_; ++n) { + int block_offset = scale_.offset(n); + // first, compute diff_i * y_i / s_i + caffe_mul < Dtype > (channels_ * height_ * width_, + top_diff + block_offset, top_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + caffe_div < Dtype > (channels_ * height_ * width_, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), + scale_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + // Now, compute the accumulated ratios and the bottom diff + caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); + for (int c = 0; c < size_ - 1; ++c) { + caffe_axpy < Dtype > (height_ * width_, 1., + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + } + for (int c = 0; c < channels_; ++c) { + caffe_axpy < Dtype > (height_ * width_, 1., + padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), + accum_ratio_data); + // compute bottom diff + caffe_mul < Dtype > (height_ * width_, + bottom_data + top[0]->offset(n, c), + accum_ratio_data, accum_ratio_times_bottom); + caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value, + accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); + caffe_axpy < Dtype > (height_ * width_, -1., + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + } + } } -template +template void LRNLayer::WithinChannelBackward( - const vector*>& top, 
const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - vector product_propagate_down(2, true); - product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); - power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); - pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); - square_layer_->Backward(square_top_vec_, propagate_down, - square_bottom_vec_); - split_layer_->Backward(split_top_vec_, propagate_down, bottom); - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + vector product_propagate_down(2, true); + product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); + power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); + pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); + square_layer_->Backward(square_top_vec_, propagate_down, + square_bottom_vec_); + split_layer_->Backward(split_top_vec_, propagate_down, bottom); + } } -template +template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. 
- int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale(LFSkernel, - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput(LCOkernel, - n_threads, bottom_data, scale_data, -beta_, top_data); + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(LFSkernel, + n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(LCOkernel, + n_threads, bottom_data, scale_data, -beta_, top_data); } -template +template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff(LCDkernel, - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(LCDkernel, + n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); } -template +template void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } -template +template void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } #ifdef CPU_ONLY STUB_GPU(LRNLayer); @@ -320,7 +320,7 @@ STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); #endif -INSTANTIATE_CLASS(LRNLayer); -REGISTER_LAYER_CLASS(LRN); +INSTANTIATE_CLASS (LRNLayer); +REGISTER_LAYER_CLASS (LRN); } // namespace caffe diff 
--git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 42de4198..2cd04f93 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -8,114 +8,114 @@ namespace caffe { -template +template void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - batch_size_ = this->layer_param_.memory_data_param().batch_size(); - channels_ = this->layer_param_.memory_data_param().channels(); - height_ = this->layer_param_.memory_data_param().height(); - width_ = this->layer_param_.memory_data_param().width(); - size_ = channels_ * height_ * width_; - CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; - vector label_shape(1, batch_size_); - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(label_shape); - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(label_shape); - data_ = NULL; - labels_ = NULL; - added_data_.cpu_data(); - added_label_.cpu_data(); + const vector*>& top) { + batch_size_ = this->layer_param_.memory_data_param().batch_size(); + channels_ = this->layer_param_.memory_data_param().channels(); + height_ = this->layer_param_.memory_data_param().height(); + width_ = this->layer_param_.memory_data_param().width(); + size_ = channels_ * height_ * width_; + CHECK_GT(batch_size_ * size_, 0) << + "batch_size, channels, height, and width must be specified and" + " positive in memory_data_param"; + vector label_shape(1, batch_size_); + top[0]->Reshape(batch_size_, channels_, height_, width_); + top[1]->Reshape(label_shape); + added_data_.Reshape(batch_size_, channels_, height_, width_); + added_label_.Reshape(label_shape); + data_ = NULL; + labels_ = NULL; + added_data_.cpu_data(); + added_label_.cpu_data(); } -template +template void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { - CHECK(!has_new_data_) << 
- "Can't add data until current data has been consumed."; - size_t num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to add."; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) - this->data_transformer_->Transform(datum_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = datum_vector[item_id].label(); - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; + CHECK(!has_new_data_) << + "Can't add data until current data has been consumed."; + size_t num = datum_vector.size(); + CHECK_GT(num, 0) << "There is no datum to add."; + CHECK_EQ(num % batch_size_, 0) << + "The added data must be a multiple of the batch size."; + added_data_.Reshape(num, channels_, height_, width_); + added_label_.Reshape(num, 1, 1, 1); + // Apply data transformations (mirror, scale, crop...) 
+ this->data_transformer_->Transform(datum_vector, &added_data_); + // Copy Labels + Dtype* top_label = added_label_.mutable_cpu_data(); + for (int item_id = 0; item_id < num; ++item_id) { + top_label[item_id] = datum_vector[item_id].label(); + } + // num_images == batch_size_ + Dtype* top_data = added_data_.mutable_cpu_data(); + Reset(top_data, top_label, num); + has_new_data_ = true; } -template +template void MemoryDataLayer::AddMatVector(const vector& mat_vector, - const vector& labels) { - size_t num = mat_vector.size(); - CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; - CHECK_GT(num, 0) << "There is no mat to add"; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) - this->data_transformer_->Transform(mat_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = labels[item_id]; - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; + const vector& labels) { + size_t num = mat_vector.size(); + CHECK(!has_new_data_) << + "Can't add mat until current data has been consumed."; + CHECK_GT(num, 0) << "There is no mat to add"; + CHECK_EQ(num % batch_size_, 0) << + "The added data must be a multiple of the batch size."; + added_data_.Reshape(num, channels_, height_, width_); + added_label_.Reshape(num, 1, 1, 1); + // Apply data transformations (mirror, scale, crop...) 
+ this->data_transformer_->Transform(mat_vector, &added_data_); + // Copy Labels + Dtype* top_label = added_label_.mutable_cpu_data(); + for (int item_id = 0; item_id < num; ++item_id) { + top_label[item_id] = labels[item_id]; + } + // num_images == batch_size_ + Dtype* top_data = added_data_.mutable_cpu_data(); + Reset(top_data, top_label, num); + has_new_data_ = true; } -template +template void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { - CHECK(data); - CHECK(labels); - CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size"; - // Warn with transformation parameters since a memory array is meant to - // be generic and no transformations are done with Reset(). - if (this->layer_param_.has_transform_param()) { - LOG(WARNING) << this->type() << " does not transform array data on Reset()"; - } - data_ = data; - labels_ = labels; - n_ = n; - pos_ = 0; + CHECK(data); + CHECK(labels); + CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size"; + // Warn with transformation parameters since a memory array is meant to + // be generic and no transformations are done with Reset(). 
+ if (this->layer_param_.has_transform_param()) { + LOG(WARNING) << this->type() << " does not transform array data on Reset()"; + } + data_ = data; + labels_ = labels; + n_ = n; + pos_ = 0; } -template +template void MemoryDataLayer::set_batch_size(int new_size) { - CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; - batch_size_ = new_size; - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(batch_size_, 1, 1, 1); + CHECK(!has_new_data_) << + "Can't change batch_size until current data has been consumed."; + batch_size_ = new_size; + added_data_.Reshape(batch_size_, channels_, height_, width_); + added_label_.Reshape(batch_size_, 1, 1, 1); } -template +template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(batch_size_, 1, 1, 1); - top[0]->set_cpu_data(data_ + pos_ * size_); - top[1]->set_cpu_data(labels_ + pos_); - pos_ = (pos_ + batch_size_) % n_; - if (pos_ == 0) - has_new_data_ = false; + const vector*>& top) { + CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; + top[0]->Reshape(batch_size_, channels_, height_, width_); + top[1]->Reshape(batch_size_, 1, 1, 1); + top[0]->set_cpu_data(data_ + pos_ * size_); + top[1]->set_cpu_data(labels_ + pos_); + pos_ = (pos_ + batch_size_) % n_; + if (pos_ == 0) + has_new_data_ = false; } -INSTANTIATE_CLASS(MemoryDataLayer); -REGISTER_LAYER_CLASS(MemoryData); +INSTANTIATE_CLASS (MemoryDataLayer); +REGISTER_LAYER_CLASS (MemoryData); } // namespace caffe diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 4267a594..5e57cf85 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -10,58 +10,58 @@ 
namespace caffe { -template +template void MultinomialLogisticLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); } -template +template void MultinomialLogisticLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); - loss -= log(prob); - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + Dtype prob = std::max( + bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + loss -= log(prob); + } + top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void MultinomialLogisticLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - 
int dim = bottom[0]->count() / bottom[0]->num(); - caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - const Dtype scale = - top[0]->cpu_diff()[0] / num; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + label] = scale / prob; - } - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); + const Dtype scale = -top[0]->cpu_diff()[0] / num; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + Dtype prob = std::max( + bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + bottom_diff[i * dim + label] = scale / prob; + } + } } -INSTANTIATE_CLASS(MultinomialLogisticLossLayer); -REGISTER_LAYER_CLASS(MultinomialLogisticLoss); +INSTANTIATE_CLASS (MultinomialLogisticLossLayer); +REGISTER_LAYER_CLASS (MultinomialLogisticLoss); } // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index cbeeb150..0bd4e989 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -7,253 +7,254 @@ namespace caffe { -template +template void MVNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 
bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, 1, - bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); - eps_ = this->layer_param_.mvn_param().eps(); + const vector*>& top) { + top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), + 1, 1); + variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), + 1, 1); + temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + sum_multiplier_.Reshape(1, 1, + bottom[0]->height(), bottom[0]->width()); + Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); + eps_ = this->layer_param_.mvn_param().eps(); } -template +template void MVNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), - sum_multiplier_.cpu_data(), 0., - variance_.mutable_cpu_data()); // E(X^2) - caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), - temp_.mutable_cpu_data()); // (EX)^2 - caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), - variance_.mutable_cpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); - - // normalize variance - caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), - variance_.mutable_cpu_data()); - - caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); - } else { - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - - // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_cpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), + sum_multiplier_.cpu_data(), 0., + variance_.mutable_cpu_data()); // E(X^2) + caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), + temp_.mutable_cpu_data()); // (EX)^2 + caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), + variance_.mutable_cpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); + + caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); + + // normalize variance + caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), + variance_.mutable_cpu_data()); + + caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); + + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); + + caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); + } else { + caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + + // subtract mean + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); + + caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); + } } -template +template void MVNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - bottom_diff); - caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., - bottom_diff); - - caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + bottom_diff); + caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., + bottom_diff); + + caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); + + caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); + } else { + caffe_copy(temp_.count(), top_diff, bottom_diff); + } } -template +template void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), + sum_multiplier_.gpu_data(), 0., + variance_.mutable_gpu_data()); // E(X^2) + caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), + temp_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), + variance_.mutable_gpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + } else { + caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + + // subtract mean + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + } } -template +template void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., + bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); + } else { + caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff, + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + } } - #ifdef CPU_ONLY STUB_GPU(MVNLayer); #endif -INSTANTIATE_CLASS(MVNLayer); -REGISTER_LAYER_CLASS(MVN); +INSTANTIATE_CLASS (MVNLayer); +REGISTER_LAYER_CLASS (MVN); } // namespace caffe diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp index ba67b438..2a0a2088 100644 --- a/src/caffe/layers/neuron_layer.cpp +++ b/src/caffe/layers/neuron_layer.cpp @@ -5,12 +5,12 @@ namespace caffe { -template +template void NeuronLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->ReshapeLike(*bottom[0]); + const vector*>& top) { + top[0]->ReshapeLike(*bottom[0]); } -INSTANTIATE_CLASS(NeuronLayer); +INSTANTIATE_CLASS (NeuronLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index ff86400b..d66a24f6 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -13,405 +13,404 @@ namespace caffe { using std::min; using std::max; -template +template void PoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingParameter pool_param = this->layer_param_.pooling_param(); - if (pool_param.global_pooling()) { - CHECK(!(pool_param.has_kernel_size() || - 
pool_param.has_kernel_h() || pool_param.has_kernel_w())) - << "With Global_pooling: true Filter size cannot specified"; - } else { - CHECK(!pool_param.has_kernel_size() != - !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(pool_param.has_kernel_size() || - (pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - } - CHECK((!pool_param.has_pad() && pool_param.has_pad_h() - && pool_param.has_pad_w()) - || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!pool_param.has_stride() && pool_param.has_stride_h() - && pool_param.has_stride_w()) - || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - global_pooling_ = pool_param.global_pooling(); - if (global_pooling_) { - kernel_h_ = bottom[0]->height(); - kernel_w_ = bottom[0]->width(); - } else { - if (pool_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = pool_param.kernel_size(); - } else { - kernel_h_ = pool_param.kernel_h(); - kernel_w_ = pool_param.kernel_w(); - } - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - if (!pool_param.has_pad_h()) { - pad_h_ = pad_w_ = pool_param.pad(); - } else { - pad_h_ = pool_param.pad_h(); - pad_w_ = pool_param.pad_w(); - } - if (!pool_param.has_stride_h()) { - stride_h_ = stride_w_ = pool_param.stride(); - } else { - stride_h_ = pool_param.stride_h(); - stride_w_ = pool_param.stride_w(); - } - if (global_pooling_) { - CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_AVE - || 
this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_MAX) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_h_, kernel_h_); - CHECK_LT(pad_w_, kernel_w_); - } + const vector*>& top) { + PoolingParameter pool_param = this->layer_param_.pooling_param(); + if (pool_param.global_pooling()) { + CHECK(!(pool_param.has_kernel_size() || + pool_param.has_kernel_h() || pool_param.has_kernel_w())) + << "With Global_pooling: true Filter size cannot specified"; + } else { + CHECK(!pool_param.has_kernel_size() != + !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK(pool_param.has_kernel_size() || + (pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + } + CHECK((!pool_param.has_pad() && pool_param.has_pad_h() + && pool_param.has_pad_w()) + || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK((!pool_param.has_stride() && pool_param.has_stride_h() + && pool_param.has_stride_w()) + || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + global_pooling_ = pool_param.global_pooling(); + if (global_pooling_) { + kernel_h_ = bottom[0]->height(); + kernel_w_ = bottom[0]->width(); + } else { + if (pool_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = pool_param.kernel_size(); + } else { + kernel_h_ = pool_param.kernel_h(); + kernel_w_ = pool_param.kernel_w(); + } + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!pool_param.has_pad_h()) { + pad_h_ = pad_w_ = pool_param.pad(); + } else { + pad_h_ = pool_param.pad_h(); + pad_w_ = pool_param.pad_w(); + } + if (!pool_param.has_stride_h()) { + stride_h_ = stride_w_ = pool_param.stride(); + } else { + 
stride_h_ = pool_param.stride_h(); + stride_w_ = pool_param.stride_w(); + } + if (global_pooling_) { + CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) + << "With Global_pooling: true; only pad = 0 and stride = 1"; + } + if (pad_h_ != 0 || pad_w_ != 0) { + CHECK(this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_AVE + || this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) + << "Padding implemented only for average and max pooling."; + CHECK_LT(pad_h_, kernel_h_); + CHECK_LT(pad_w_, kernel_w_); + } } - -template +template void PoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - if (global_pooling_) { - kernel_h_ = bottom[0]->height(); - kernel_w_ = bottom[0]->width(); - } - pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; - pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; - if (pad_h_ || pad_w_) { - // If we have padding, ensure that the last pooling starts strictly - // inside the image (instead of at the padding); otherwise clip the last. - if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) { - --pooled_height_; - } - if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) { - --pooled_width_; - } - CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); - CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); - } - top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - if (top.size() > 1) { - top[1]->ReshapeLike(*top[0]); - } - // If max pooling, we will initialize the vector index part. 
- if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX && top.size() == 1) { - max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - } - // If stochastic pooling, we will initialize the random index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_STOCHASTIC) { - rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - } + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + if (global_pooling_) { + kernel_h_ = bottom[0]->height(); + kernel_w_ = bottom[0]->width(); + } + pooled_height_ = static_cast(ceil(static_cast( + height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(ceil(static_cast( + width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + if (pad_h_ || pad_w_) { + // If we have padding, ensure that the last pooling starts strictly + // inside the image (instead of at the padding); otherwise clip the last. + if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) { + --pooled_height_; + } + if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) { + --pooled_width_; + } + CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); + CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); + } + top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, + pooled_width_); + if (top.size() > 1) { + top[1]->ReshapeLike(*top[0]); + } + // If max pooling, we will initialize the vector index part. + if (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_MAX && top.size() == 1) { + max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, + pooled_width_); + } + // If stochastic pooling, we will initialize the random index part. 
+ if (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_STOCHASTIC) { + rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, + pooled_width_); + } } // TODO(Yangqing): Is there a faster way to do pooling in the channel-first // case? -template +template void PoolingLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int top_count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; // suppress warnings about uninitalized variables - Dtype* top_mask = NULL; - // Different pooling methods. We explicitly do the switch outside the for - // loop to save time, although this results in more code. - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - // Initialize - if (use_top_mask) { - top_mask = top[1]->mutable_cpu_data(); - caffe_set(top_count, Dtype(-1), top_mask); - } else { - mask = max_idx_.mutable_cpu_data(); - caffe_set(top_count, -1, mask); - } - caffe_set(top_count, Dtype(-FLT_MAX), top_data); - // The main loop - for (int n = 0; n < bottom[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_); - int wend = min(wstart + kernel_w_, width_); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - const int pool_index = ph * pooled_width_ + pw; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width_ + w; - if (bottom_data[index] > top_data[pool_index]) { - top_data[pool_index] = bottom_data[index]; - if (use_top_mask) { - top_mask[pool_index] = static_cast(index); - } else { - mask[pool_index] = index; - } 
- } - } - } - } - } - // compute offset - bottom_data += bottom[0]->offset(0, 1); - top_data += top[0]->offset(0, 1); - if (use_top_mask) { - top_mask += top[0]->offset(0, 1); - } else { - mask += top[0]->offset(0, 1); - } - } - } - break; - case PoolingParameter_PoolMethod_AVE: - for (int i = 0; i < top_count; ++i) { - top_data[i] = 0; - } - // The main loop - for (int n = 0; n < bottom[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_ + pad_h_); - int wend = min(wstart + kernel_w_, width_ + pad_w_); - int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height_); - wend = min(wend, width_); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - top_data[ph * pooled_width_ + pw] += - bottom_data[h * width_ + w]; - } - } - top_data[ph * pooled_width_ + pw] /= pool_size; - } - } - // compute offset - bottom_data += bottom[0]->offset(0, 1); - top_data += top[0]->offset(0, 1); - } - } - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - NOT_IMPLEMENTED; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int top_count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; // suppress warnings about uninitalized variables + Dtype* top_mask = NULL; + // Different pooling methods. We explicitly do the switch outside the for + // loop to save time, although this results in more code. 
+ switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + // Initialize + if (use_top_mask) { + top_mask = top[1]->mutable_cpu_data(); + caffe_set(top_count, Dtype(-1), top_mask); + } else { + mask = max_idx_.mutable_cpu_data(); + caffe_set(top_count, -1, mask); + } + caffe_set(top_count, Dtype(-FLT_MAX), top_data); + // The main loop + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_); + int wend = min(wstart + kernel_w_, width_); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const int pool_index = ph * pooled_width_ + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width_ + w; + if (bottom_data[index] > top_data[pool_index]) { + top_data[pool_index] = bottom_data[index]; + if (use_top_mask) { + top_mask[pool_index] = static_cast(index); + } else { + mask[pool_index] = index; + } + } + } + } + } + } + // compute offset + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + if (use_top_mask) { + top_mask += top[0]->offset(0, 1); + } else { + mask += top[0]->offset(0, 1); + } + } + } + break; + case PoolingParameter_PoolMethod_AVE: + for (int i = 0; i < top_count; ++i) { + top_data[i] = 0; + } + // The main loop + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_ + pad_h_); + int wend = min(wstart + kernel_w_, width_ + pad_w_); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); 
+ hend = min(hend, height_); + wend = min(wend, width_); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + top_data[ph * pooled_width_ + pw] += + bottom_data[h * width_ + w]; + } + } + top_data[ph * pooled_width_ + pw] /= pool_size; + } + } + // compute offset + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } -template +template void PoolingLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - // Different pooling methods. We explicitly do the switch outside the for - // loop to save time, although this results in more codes. - caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - const int* mask = NULL; // suppress warnings about uninitialized variables - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - // The main loop - if (use_top_mask) { - top_mask = top[1]->cpu_data(); - } else { - mask = max_idx_.cpu_data(); - } - for (int n = 0; n < top[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - const int index = ph * pooled_width_ + pw; - const int bottom_index = - use_top_mask ? 
top_mask[index] : mask[index]; - bottom_diff[bottom_index] += top_diff[index]; - } - } - bottom_diff += bottom[0]->offset(0, 1); - top_diff += top[0]->offset(0, 1); - if (use_top_mask) { - top_mask += top[0]->offset(0, 1); - } else { - mask += top[0]->offset(0, 1); - } - } - } - break; - case PoolingParameter_PoolMethod_AVE: - // The main loop - for (int n = 0; n < top[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_ + pad_h_); - int wend = min(wstart + kernel_w_, width_ + pad_w_); - int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height_); - wend = min(wend, width_); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - bottom_diff[h * width_ + w] += - top_diff[ph * pooled_width_ + pw] / pool_size; - } - } - } - } - // offset - bottom_diff += bottom[0]->offset(0, 1); - top_diff += top[0]->offset(0, 1); - } - } - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - NOT_IMPLEMENTED; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + // Different pooling methods. We explicitly do the switch outside the for + // loop to save time, although this results in more codes. + caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + const int* mask = NULL; // suppress warnings about uninitialized variables + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + // The main loop + if (use_top_mask) { + top_mask = top[1]->cpu_data(); + } else { + mask = max_idx_.cpu_data(); + } + for (int n = 0; n < top[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + const int index = ph * pooled_width_ + pw; + const int bottom_index = + use_top_mask ? top_mask[index] : mask[index]; + bottom_diff[bottom_index] += top_diff[index]; + } + } + bottom_diff += bottom[0]->offset(0, 1); + top_diff += top[0]->offset(0, 1); + if (use_top_mask) { + top_mask += top[0]->offset(0, 1); + } else { + mask += top[0]->offset(0, 1); + } + } + } + break; + case PoolingParameter_PoolMethod_AVE: + // The main loop + for (int n = 0; n < top[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_ + pad_h_); + int wend = min(wstart + kernel_w_, width_ + pad_w_); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height_); + wend = min(wend, width_); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + bottom_diff[h * width_ + w] += + top_diff[ph * pooled_width_ + pw] / pool_size; + } + } + } + } + // offset + bottom_diff += bottom[0]->offset(0, 1); + top_diff += top[0]->offset(0, 1); + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } -template +template void PoolingLayer::Forward_gpu(const 
vector*>& bottom, - const vector*>& top){ - //Forward_cpu(bottom, top); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector*>& top) { + //Forward_cpu(bottom, top); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; + Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->mutable_gpu_data(); + } else { + mask = max_idx_.mutable_gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, + mask, top_mask); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + if (this->phase_ == TRAIN) { + // We need to create the random index as well. 
+ caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, + rand_idx_.mutable_gpu_data(), top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, top_data); + } + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } -template +template void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - //Backward_cpu(top, propagate_down, bottom); - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. 
- const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward(count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector& propagate_down, const vector*>& bottom) { + //Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + const int* mask = NULL; + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->gpu_data(); + } else { + mask = max_idx_.gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, + kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, + bottom_diff); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolBackward(count, top_diff, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, + top[0]->num(), channels_, height_, width_, pooled_height_, + pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, + bottom_diff); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } #ifdef CPU_ONLY STUB_GPU(PoolingLayer); #endif -INSTANTIATE_CLASS(PoolingLayer); +INSTANTIATE_CLASS (PoolingLayer); } // namespace caffe diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index d3c374f1..e4a3e456 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -7,175 +7,173 @@ #include "caffe/util/ocl_util.hpp" #include "caffe/util/ocl_wrapper.hpp" - namespace caffe { -template +template void PowerLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - power_ = this->layer_param_.power_param().power(); - scale_ = this->layer_param_.power_param().scale(); - shift_ = this->layer_param_.power_param().shift(); - diff_scale_ = power_ * scale_; + const vector*>& top) { + NeuronLayer 
< Dtype > ::LayerSetUp(bottom, top); + power_ = this->layer_param_.power_param().power(); + scale_ = this->layer_param_.power_param().scale(); + shift_ = this->layer_param_.power_param().shift(); + diff_scale_ = power_ * scale_; } - // Compute y = (shift + scale * x)^power -template +template void PowerLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->cpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_powx(count, top_data, power_, top_data); - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); + caffe_set(count, value, top_data); + return; + } + const Dtype* bottom_data = bottom[0]->cpu_data(); + caffe_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_powx(count, top_data, power_, top_data); + } } -template +template void PowerLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->cpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, 
bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_scal(count, diff_scale_, bottom_diff); - } - } - } - if (diff_scale_ != Dtype(0)) { - caffe_mul(count, top_diff, bottom_diff, bottom_diff); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->cpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + caffe_set(count, diff_scale_, bottom_diff); + } else { + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, + Dtype(0), bottom_diff); + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->cpu_data(); + caffe_div(count, top_data, bottom_data, bottom_diff); + caffe_scal(count, power_, bottom_diff); + } else { + caffe_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->cpu_data(); + caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_scal(count, diff_scale_, bottom_diff); + } + } + } + if (diff_scale_ != Dtype(0)) { + caffe_mul(count, top_diff, bottom_diff, bottom_diff); + } + } } -template +template void PowerLayer::Forward_gpu(const 
vector*>& bottom, - const vector*>& top){ - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - ocl_memset(top_data, value, count); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_gpu_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); + ocl_memset(top_data, value, count); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_gpu_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(count, top_data, power_, top_data); + } } -template +template void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - ocl_memset(bottom_diff, diff_scale_,count); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + 
scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + ocl_memset(bottom_diff, diff_scale_, count); + } else { + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, + Dtype(0), bottom_diff); + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, diff_scale_ * shift_, 
bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); + } else { + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); + } + } + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU(PowerLayer); #endif -INSTANTIATE_CLASS(PowerLayer); -REGISTER_LAYER_CLASS(Power); +INSTANTIATE_CLASS (PowerLayer); +REGISTER_LAYER_CLASS (Power); } // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 426a0cad..5332a178 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -7,203 +7,205 @@ namespace caffe { -template +template void PReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; - PReLUParameter prelu_param = this->layer_param().prelu_param(); - int channels = bottom[0]->channels(); - channel_shared_ = prelu_param.channel_shared(); - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - this->blobs_.resize(1); - if (channel_shared_) { - this->blobs_[0].reset(new Blob(vector(0))); - } else { - this->blobs_[0].reset(new Blob(vector(1, channels))); - } - shared_ptr > filler; - if (prelu_param.has_filler()) { - 
filler.reset(GetFiller(prelu_param.filler())); - } else { - FillerParameter filler_param; - filler_param.set_type("constant"); - filler_param.set_value(0.25); - filler.reset(GetFiller(filler_param)); - } - filler->Fill(this->blobs_[0].get()); - } - if (channel_shared_) { - CHECK_EQ(this->blobs_[0]->count(), 1) - << "Negative slope size is inconsistent with prototxt config"; - } else { - CHECK_EQ(this->blobs_[0]->count(), channels) - << "Negative slope size is inconsistent with prototxt config"; - } - - // Propagate gradients to the parameters (as directed by backward pass). - this->param_propagate_down_.resize(this->blobs_.size(), true); - multiplier_.Reshape(vector(1, bottom[0]->count(1))); - backward_buff_.Reshape(vector(1, bottom[0]->count(1))); - caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + PReLUParameter prelu_param = this->layer_param().prelu_param(); + int channels = bottom[0]->channels(); + channel_shared_ = prelu_param.channel_shared(); + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + this->blobs_.resize(1); + if (channel_shared_) { + this->blobs_[0].reset(new Blob(vector(0))); + } else { + this->blobs_[0].reset(new Blob(vector(1, channels))); + } + shared_ptr < Filler > filler; + if (prelu_param.has_filler()) { + filler.reset(GetFiller < Dtype > (prelu_param.filler())); + } else { + FillerParameter filler_param; + filler_param.set_type("constant"); + filler_param.set_value(0.25); + filler.reset(GetFiller < Dtype > (filler_param)); + } + filler->Fill(this->blobs_[0].get()); + } + if (channel_shared_) { + CHECK_EQ(this->blobs_[0]->count(), 1) + << "Negative slope size is inconsistent with prototxt config"; + } else { + CHECK_EQ(this->blobs_[0]->count(), channels) + << "Negative slope size is inconsistent with prototxt config"; + } + + // Propagate gradients to the 
parameters (as directed by backward pass). + this->param_propagate_down_.resize(this->blobs_.size(), true); + multiplier_.Reshape(vector(1, bottom[0]->count(1))); + backward_buff_.Reshape(vector(1, bottom[0]->count(1))); + caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); } -template +template void PReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; - top[0]->ReshapeLike(*bottom[0]); - if (bottom[0] == top[0]) { - // For in-place computation - bottom_memory_.ReshapeLike(*bottom[0]); - } + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + top[0]->ReshapeLike(*bottom[0]); + if (bottom[0] == top[0]) { + // For in-place computation + bottom_memory_.ReshapeLike(*bottom[0]); + } } -template +template void PReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->cpu_data(); - - // For in-place computation - if (bottom[0] == top[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data()); - } - - // if channel_shared, channel index in the following computation becomes - // always zero. - const int div_factor = channel_shared_ ? 
channels : 1; - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - top_data[i] = std::max(bottom_data[i], Dtype(0)) - + slope_data[c] * std::min(bottom_data[i], Dtype(0)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->cpu_data(); + + // For in-place computation + if (bottom[0] == top[0]) { + caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data()); + } + + // if channel_shared, channel index in the following computation becomes + // always zero. + const int div_factor = channel_shared_ ? channels : 1; + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + top_data[i] = std::max(bottom_data[i], Dtype(0)) + + slope_data[c] * std::min(bottom_data[i], Dtype(0)); + } } -template +template void PReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* slope_data = this->blobs_[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.cpu_data(); - } - - // if channel_shared, channel index in the following computation becomes - // always zero. - const int div_factor = channel_shared_ ? channels : 1; - - // Propagte to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. 
- if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + slope_data[c] * (bottom_data[i] <= 0)); - } - } + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* slope_data = this->blobs_[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + // For in-place computation + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.cpu_data(); + } + + // if channel_shared, channel index in the following computation becomes + // always zero. + const int div_factor = channel_shared_ ? channels : 1; + + // Propagte to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. 
+ if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) + + slope_data[c] * (bottom_data[i] <= 0)); + } + } } -template +template void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? channels : 1; - - if (top[0] == bottom[0]) { - caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + const int div_factor = channel_shared_ ? 
channels : 1; + + if (top[0] == bottom[0]) { + caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, + div_factor); } -template +template void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. - if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward( - cdim, top_diff, top[0]->offset(n), - bottom_data, bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - if (channel_shared_) { - Dtype d; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = channel_shared_ ? 
channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.gpu_data(); + } + + // Propagate to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. + if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); + int cdim = channels * dim; + Dtype dsum = 0.; + for (int n = 0; n < bottom[0]->num(); ++n) { + // compute element-wise diff + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUParamBackward( + cdim, top_diff, top[0]->offset(n), + bottom_data, bottom[0]->offset(n), + backward_buff_.mutable_gpu_diff()); + if (channel_shared_) { + Dtype d; + caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(), + multiplier_.gpu_data(), &d); + dsum += d; + } else { + caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1., + backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., + slope_diff); + } + } + if (channel_shared_) { + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + int div_factor = channel_shared_ ? 
channels : 1; + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, + slope_data, + div_factor); + } } #ifdef CPU_ONLY STUB_GPU(PReLULayer); #endif -INSTANTIATE_CLASS(PReLULayer); -REGISTER_LAYER_CLASS(PReLU); +INSTANTIATE_CLASS (PReLULayer); +REGISTER_LAYER_CLASS (PReLU); } // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 4003ddd1..32ea4bc0 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -8,206 +8,210 @@ namespace caffe { -template +template void ReductionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - op_ = this->layer_param_.reduction_param().operation(); + const vector*>& top) { + op_ = this->layer_param_.reduction_param().operation(); } -template +template void ReductionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - axis_ = bottom[0]->CanonicalAxisIndex( - this->layer_param_.reduction_param().axis()); - // In the output, we'll keep all axes up to the reduction axis, but - // throw away any after that. - // Note: currently reducing along non-tail axes is not supported; otherwise, - // we'd need to also copy any axes following an "end_axis". 
- vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + axis_); - top[0]->Reshape(top_shape); - num_ = bottom[0]->count(0, axis_); - dim_ = bottom[0]->count(axis_); - CHECK_EQ(num_, top[0]->count()); - if (op_ == ReductionParameter_ReductionOp_SUM || - op_ == ReductionParameter_ReductionOp_MEAN) { - vector sum_mult_shape(1, dim_); - sum_multiplier_.Reshape(sum_mult_shape); - caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); - } - coeff_ = this->layer_param().reduction_param().coeff(); - if (op_ == ReductionParameter_ReductionOp_MEAN) { - coeff_ /= dim_; - } + const vector*>& top) { + axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.reduction_param().axis()); + // In the output, we'll keep all axes up to the reduction axis, but + // throw away any after that. + // Note: currently reducing along non-tail axes is not supported; otherwise, + // we'd need to also copy any axes following an "end_axis". + vector top_shape(bottom[0]->shape().begin(), + bottom[0]->shape().begin() + axis_); + top[0]->Reshape(top_shape); + num_ = bottom[0]->count(0, axis_); + dim_ = bottom[0]->count(axis_); + CHECK_EQ(num_, top[0]->count()); + if (op_ == ReductionParameter_ReductionOp_SUM || + op_ == ReductionParameter_ReductionOp_MEAN) { + vector sum_mult_shape(1, dim_); + sum_multiplier_.Reshape(sum_mult_shape); + caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); + } + coeff_ = this->layer_param().reduction_param().coeff(); + if (op_ == ReductionParameter_ReductionOp_MEAN) { + coeff_ /= dim_; + } } -template +template void ReductionLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.cpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case 
ReductionParameter_ReductionOp_MEAN: - *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data); - break; - case ReductionParameter_ReductionOp_ASUM: - *top_data = caffe_cpu_asum(dim_, bottom_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_cpu_data(); - caffe_scal(num_, coeff_, top_data); - } + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.cpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data); + break; + case ReductionParameter_ReductionOp_ASUM: + *top_data = caffe_cpu_asum(dim_, bottom_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_cpu_data(); + caffe_scal(num_, coeff_, top_data); + } } -template +template void ReductionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->cpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_cpu_sign(dim_, bottom_data, bottom_diff); - caffe_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->cpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_set(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_cpu_sign(dim_, bottom_data, bottom_diff); + caffe_scal(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + bottom_diff += dim_; + ++top_diff; + } } -template +template void ReductionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - 
default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); + } } -template +template void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->gpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data, bottom_diff); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + bottom_diff += dim_; + ++top_diff; + } } #ifdef CPU_ONLY STUB_GPU(ReductionLayer); #endif -INSTANTIATE_CLASS(ReductionLayer); -REGISTER_LAYER_CLASS(Reduction); +INSTANTIATE_CLASS (ReductionLayer); +REGISTER_LAYER_CLASS (Reduction); } // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index c29d5baa..7f3b2729 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,67 +5,64 @@ #include "caffe/vision_layers.hpp" namespace caffe { -template +template void ReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = 
this->layer_param_.relu_param().negative_slope(); - for (int i = 0; i < count; ++i) { - top_data[i] = std::max(bottom_data[i], Dtype(0)) - + negative_slope * std::min(bottom_data[i], Dtype(0)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + for (int i = 0; i < count; ++i) { + top_data[i] = std::max(bottom_data[i], Dtype(0)) + + negative_slope * std::min(bottom_data[i], Dtype(0)); + } } -template +template void ReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + negative_slope * (bottom_data[i] <= 0)); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) + + negative_slope * (bottom_data[i] <= 0)); + } + } } - -template +template void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - 
ReLUForward(count,bottom_data,top_data,negative_slope); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUForward(count, bottom_data, top_data, negative_slope); } - -template +template void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope); - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope); + } } - #ifdef CPU_ONLY STUB_GPU(ReLULayer); #endif -INSTANTIATE_CLASS(ReLULayer); +INSTANTIATE_CLASS (ReLULayer); } // namespace caffe diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index ffe970f2..8dbbbcb0 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -5,91 +5,92 @@ namespace caffe { -template +template void ReshapeLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - inferred_axis_ = -1; - copy_axes_.clear(); - const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); - const int top_num_axes = top_blob_shape.dim_size(); - constant_count_ = 1; - for (int i = 0; i < top_num_axes; 
++i) { - const int top_dim = top_blob_shape.dim(i); - if (top_dim == 0) { - copy_axes_.push_back(i); - } else if (top_dim == -1) { - CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " - << "-1 dims; at most a single (1) value of -1 may be specified"; - inferred_axis_ = i; - } else { - constant_count_ *= top_dim; - } - } + const vector*>& top) { + inferred_axis_ = -1; + copy_axes_.clear(); + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int top_num_axes = top_blob_shape.dim_size(); + constant_count_ = 1; + for (int i = 0; i < top_num_axes; ++i) { + const int top_dim = top_blob_shape.dim(i); + if (top_dim == 0) { + copy_axes_.push_back(i); + } else if (top_dim == -1) { + CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " + << "-1 dims; at most a single (1) value of -1 may be specified"; + inferred_axis_ = i; + } else { + constant_count_ *= top_dim; + } + } } -template +template void ReshapeLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int input_start_axis = this->layer_param_.reshape_param().axis(); - const int start_axis = (input_start_axis >= 0) ? input_start_axis : - bottom[0]->num_axes() + input_start_axis + 1; - CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; - CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis - << " out of range for " << bottom[0]->num_axes() << "-D input blob"; - const int num_axes = this->layer_param_.reshape_param().num_axes(); - CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; - const int end_axis = - (num_axes == -1) ? 
bottom[0]->num_axes() : (start_axis + num_axes); - CHECK_LE(end_axis, bottom[0]->num_axes()) - << "end_axis = axis + num_axes is out of range"; - const int num_axes_replaced = end_axis - start_axis; - const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; - const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); - const int num_new_axes = top_blob_shape.dim_size(); - vector top_shape(num_axes_retained + num_new_axes); - int top_shape_index = 0; - for (int i = 0; i < start_axis; ++i) { - top_shape[top_shape_index++] = bottom[0]->shape(i); - } - for (int i = 0; i < num_new_axes; ++i) { - top_shape[top_shape_index++] = top_blob_shape.dim(i); - } - for (int i = end_axis; i < bottom[0]->num_axes(); ++i) { - top_shape[top_shape_index++] = bottom[0]->shape(i); - } - CHECK_EQ(top_shape_index, top_shape.size()); - for (int i = 0; i < copy_axes_.size(); ++i) { - const int copy_axis_index = copy_axes_[i]; - CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) - << "new shape contains a 0, but there was no corresponding bottom axis " - << "to copy"; - top_shape[start_axis + copy_axis_index] = - bottom[0]->shape(start_axis + copy_axis_index); - } - if (inferred_axis_ >= 0) { - // A -1 dim was specified; infer the correct dimension by computing the - // product of the other dimensions. 
- int explicit_count = constant_count_; - explicit_count *= bottom[0]->count(0, start_axis); - explicit_count *= bottom[0]->count(end_axis); - for (int i = 0; i < copy_axes_.size(); ++i) { - const int copy_axis_index = copy_axes_[i]; - explicit_count *= top_shape[start_axis + copy_axis_index]; - } - CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" - << bottom[0]->count() << ") must be divisible by the product of " - << "the specified dimensions (" << explicit_count << ")"; - const int inferred_dim = bottom[0]->count() / explicit_count; - top_shape[start_axis + inferred_axis_] = inferred_dim; - } - top[0]->Reshape(top_shape); - CHECK_EQ(top[0]->count(), bottom[0]->count()) - << "output count must match input count"; - top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); + const vector*>& top) { + const int input_start_axis = this->layer_param_.reshape_param().axis(); + const int start_axis = + (input_start_axis >= 0) ? input_start_axis : + bottom[0]->num_axes() + input_start_axis + 1; + CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; + CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis + << " out of range for " << bottom[0]->num_axes() << "-D input blob"; + const int num_axes = this->layer_param_.reshape_param().num_axes(); + CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; + const int end_axis = + (num_axes == -1) ? 
bottom[0]->num_axes() : (start_axis + num_axes); + CHECK_LE(end_axis, bottom[0]->num_axes()) + << "end_axis = axis + num_axes is out of range"; + const int num_axes_replaced = end_axis - start_axis; + const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int num_new_axes = top_blob_shape.dim_size(); + vector top_shape(num_axes_retained + num_new_axes); + int top_shape_index = 0; + for (int i = 0; i < start_axis; ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + for (int i = 0; i < num_new_axes; ++i) { + top_shape[top_shape_index++] = top_blob_shape.dim(i); + } + for (int i = end_axis; i < bottom[0]->num_axes(); ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + CHECK_EQ(top_shape_index, top_shape.size()); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) + << "new shape contains a 0, but there was no corresponding bottom axis " + << "to copy"; + top_shape[start_axis + copy_axis_index] = + bottom[0]->shape(start_axis + copy_axis_index); + } + if (inferred_axis_ >= 0) { + // A -1 dim was specified; infer the correct dimension by computing the + // product of the other dimensions. 
+ int explicit_count = constant_count_; + explicit_count *= bottom[0]->count(0, start_axis); + explicit_count *= bottom[0]->count(end_axis); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + explicit_count *= top_shape[start_axis + copy_axis_index]; + } + CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" + << bottom[0]->count() << ") must be divisible by the product of " + << "the specified dimensions (" << explicit_count << ")"; + const int inferred_dim = bottom[0]->count() / explicit_count; + top_shape[start_axis + inferred_axis_] = inferred_dim; + } + top[0]->Reshape(top_shape); + CHECK_EQ(top[0]->count(), bottom[0]->count()) + << "output count must match input count"; + top[0]->ShareData(*bottom[0]); + top[0]->ShareDiff(*bottom[0]); } -INSTANTIATE_CLASS(ReshapeLayer); -REGISTER_LAYER_CLASS(Reshape); +INSTANTIATE_CLASS (ReshapeLayer); +REGISTER_LAYER_CLASS (Reshape); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 1c22fe19..a5be48e7 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -8,95 +8,96 @@ namespace caffe { -template +template void SigmoidCrossEntropyLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); - sigmoid_bottom_vec_.clear(); - sigmoid_bottom_vec_.push_back(bottom[0]); - sigmoid_top_vec_.clear(); - sigmoid_top_vec_.push_back(sigmoid_output_.get()); - sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + sigmoid_bottom_vec_.clear(); + sigmoid_bottom_vec_.push_back(bottom[0]); + sigmoid_top_vec_.clear(); + sigmoid_top_vec_.push_back(sigmoid_output_.get()); + sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); } -template +template void 
SigmoidCrossEntropyLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << - "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; - sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << + "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; + sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } -template +template void SigmoidCrossEntropyLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // The forward pass computes the sigmoid outputs. - sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the sigmoid outputs. 
+ sigmoid_bottom_vec_[0] = bottom[0]; + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + // Compute the loss (negative log likelihood) + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + // Stable version of loss computation from input data + const Dtype* input_data = bottom[0]->cpu_data(); + const Dtype* target = bottom[1]->cpu_data(); + Dtype loss = 0; + for (int i = 0; i < count; ++i) { + loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - + log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + } + top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void SigmoidCrossEntropyLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_sub(count, sigmoid_output_data, target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_scal(count, loss_weight / num, bottom_diff); - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); + const Dtype* target = bottom[1]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_sub(count, sigmoid_output_data, target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = 
top[0]->cpu_diff()[0]; + caffe_scal(count, loss_weight / num, bottom_diff); + } } -template -void SigmoidCrossEntropyLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } +template +void SigmoidCrossEntropyLossLayer::Backward_gpu( + const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); -REGISTER_LAYER_CLASS(SigmoidCrossEntropyLoss); +INSTANTIATE_CLASS (SigmoidCrossEntropyLossLayer); +REGISTER_LAYER_CLASS (SigmoidCrossEntropyLoss); } // namespace caffe diff 
--git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index fa13a4c1..4095ccdb 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -8,66 +8,65 @@ namespace caffe { -template +template inline Dtype sigmoid(Dtype x) { - return 1. / (1. + exp(-x)); + return 1. / (1. + exp(-x)); } -template +template void SigmoidLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = sigmoid(bottom_data[i]); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = sigmoid(bottom_data[i]); + } } -template +template void SigmoidLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - const Dtype sigmoid_x = top_data[i]; - bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + const Dtype sigmoid_x = top_data[i]; + bottom_diff[i] = top_diff[i] * sigmoid_x * (1. 
- sigmoid_x); + } + } } -template +template void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidForward(count, bottom_data, top_data); } -template +template void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward(count, top_diff, top_data, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidBackward(count, top_diff, top_data, bottom_diff); + } } #ifdef CPU_ONLY STUB_GPU(SigmoidLayer); #endif -INSTANTIATE_CLASS(SigmoidLayer); - +INSTANTIATE_CLASS (SigmoidLayer); } // namespace caffe diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index e36a5cad..05929a70 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -6,39 +6,39 @@ namespace caffe { -template +template void SilenceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if 
(propagate_down[i]) { - caffe_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_cpu_data()); - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_cpu_data()); + } + } } -template +template void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. + const vector*>& top) { + // Do nothing. } -template +template void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_data()); + } + } } #ifdef CPU_ONLY STUB_GPU(SilenceLayer); #endif -INSTANTIATE_CLASS(SilenceLayer); -REGISTER_LAYER_CLASS(Silence); +INSTANTIATE_CLASS (SilenceLayer); +REGISTER_LAYER_CLASS (Silence); } // namespace caffe diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 76021faa..7b327527 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -7,124 +7,126 @@ namespace caffe { -template +template void SliceLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const SliceParameter& slice_param = this->layer_param_.slice_param(); - CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) - << "Either axis or slice_dim should be specified; not both."; - slice_point_.clear(); - std::copy(slice_param.slice_point().begin(), - slice_param.slice_point().end(), - std::back_inserter(slice_point_)); + const vector*>& top) { + const SliceParameter& slice_param = this->layer_param_.slice_param(); + CHECK(!(slice_param.has_axis() && 
slice_param.has_slice_dim())) + << "Either axis or slice_dim should be specified; not both."; + slice_point_.clear(); + std::copy(slice_param.slice_point().begin(), + slice_param.slice_point().end(), + std::back_inserter(slice_point_)); } -template +template void SliceLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int num_axes = bottom[0]->num_axes(); - const SliceParameter& slice_param = this->layer_param_.slice_param(); - if (slice_param.has_slice_dim()) { - slice_axis_ = static_cast(slice_param.slice_dim()); - // Don't allow negative indexing for slice_dim, a uint32 -- almost - // certainly unintended. - CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " - << "produced negative result; slice_dim must satisfy " - << "0 <= slice_dim < " << kMaxBlobAxes; - CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; - } else { - slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); - } - vector top_shape = bottom[0]->shape(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - num_slices_ = bottom[0]->count(0, slice_axis_); - slice_size_ = bottom[0]->count(slice_axis_ + 1); - int count = 0; - if (slice_point_.size() != 0) { - CHECK_EQ(slice_point_.size(), top.size() - 1); - CHECK_LE(top.size(), bottom_slice_axis); - int prev = 0; - vector slices; - for (int i = 0; i < slice_point_.size(); ++i) { - CHECK_GT(slice_point_[i], prev); - slices.push_back(slice_point_[i] - prev); - prev = slice_point_[i]; - } - slices.push_back(bottom_slice_axis - prev); - for (int i = 0; i < top.size(); ++i) { - top_shape[slice_axis_] = slices[i]; - top[i]->Reshape(top_shape); - count += top[i]->count(); - } - } else { - CHECK_EQ(bottom_slice_axis % top.size(), 0) - << "Number of top blobs (" << top.size() << ") should evenly " - << "divide input slice axis (" << bottom_slice_axis << ")"; - top_shape[slice_axis_] = bottom_slice_axis / top.size(); - for (int i = 0; i < top.size(); ++i) { - top[i]->Reshape(top_shape); - 
count += top[i]->count(); - } - } - CHECK_EQ(count, bottom[0]->count()); + const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + const SliceParameter& slice_param = this->layer_param_.slice_param(); + if (slice_param.has_slice_dim()) { + slice_axis_ = static_cast(slice_param.slice_dim()); + // Don't allow negative indexing for slice_dim, a uint32 -- almost + // certainly unintended. + CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " + << "produced negative result; slice_dim must satisfy " + << "0 <= slice_dim < " << kMaxBlobAxes; + CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; + } else { + slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); + } + vector top_shape = bottom[0]->shape(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + num_slices_ = bottom[0]->count(0, slice_axis_); + slice_size_ = bottom[0]->count(slice_axis_ + 1); + int count = 0; + if (slice_point_.size() != 0) { + CHECK_EQ(slice_point_.size(), top.size() - 1); + CHECK_LE(top.size(), bottom_slice_axis); + int prev = 0; + vector slices; + for (int i = 0; i < slice_point_.size(); ++i) { + CHECK_GT(slice_point_[i], prev); + slices.push_back(slice_point_[i] - prev); + prev = slice_point_[i]; + } + slices.push_back(bottom_slice_axis - prev); + for (int i = 0; i < top.size(); ++i) { + top_shape[slice_axis_] = slices[i]; + top[i]->Reshape(top_shape); + count += top[i]->count(); + } + } else { + CHECK_EQ(bottom_slice_axis % top.size(), 0) + << "Number of top blobs (" << top.size() << ") should evenly " + << "divide input slice axis (" << bottom_slice_axis << ")"; + top_shape[slice_axis_] = bottom_slice_axis / top.size(); + for (int i = 0; i < top.size(); ++i) { + top[i]->Reshape(top_shape); + count += top[i]->count(); + } + } + CHECK_EQ(count, bottom[0]->count()); } -template +template void SliceLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = 
bottom[0]->cpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_cpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - for (int n = 0; n < num_slices_; ++n) { - const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - bottom_data + bottom_offset, top_data + top_offset); - } - offset_slice_axis += top_slice_axis; - } + const vector*>& top) { + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->cpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_cpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + bottom_data + bottom_offset, top_data + top_offset); + } + offset_slice_axis += top_slice_axis; + } } -template +template void SliceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - for (int n = 0; n < num_slices_; ++n) { - const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - top_diff + top_offset, bottom_diff + bottom_offset); - } - offset_slice_axis += 
top_slice_axis; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + int offset_slice_axis = 0; + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + top_diff + top_offset, bottom_diff + bottom_offset); + } + offset_slice_axis += top_slice_axis; + } } -template +template void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ + const vector*>& top) { } -template +template void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ + const vector& propagate_down, const vector*>& bottom) { } #ifdef CPU_ONLY STUB_GPU(SliceLayer); #endif -INSTANTIATE_CLASS(SliceLayer); -REGISTER_LAYER_CLASS(Slice); +INSTANTIATE_CLASS (SliceLayer); +REGISTER_LAYER_CLASS (Slice); } // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 27c18b7b..117a966f 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -1,4 +1,4 @@ -#include +s#include #include #include "caffe/layer.hpp" @@ -7,152 +7,151 @@ namespace caffe { -template +template void SoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); - top[0]->ReshapeLike(*bottom[0]); - vector mult_dims(1, bottom[0]->shape(softmax_axis_)); - sum_multiplier_.Reshape(mult_dims); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - caffe_set(sum_multiplier_.count(), Dtype(1), 
multiplier_data); - outer_num_ = bottom[0]->count(0, softmax_axis_); - inner_num_ = bottom[0]->count(softmax_axis_ + 1); - vector scale_dims = bottom[0]->shape(); - scale_dims[softmax_axis_] = 1; - scale_.Reshape(scale_dims); + const vector*>& top) { + softmax_axis_ = + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + top[0]->ReshapeLike(*bottom[0]); + vector mult_dims(1, bottom[0]->shape(softmax_axis_)); + sum_multiplier_.Reshape(mult_dims); + Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + vector scale_dims = bottom[0]->shape(); + scale_dims[softmax_axis_] = 1; + scale_.Reshape(scale_dims); } - -template -SoftmaxLayer::~SoftmaxLayer(){ +template +SoftmaxLayer::~SoftmaxLayer() { } -template +template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int channels = bottom[0]->shape(softmax_axis_); - int dim = bottom[0]->count() / outer_num_; - caffe_copy(bottom[0]->count(), bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. 
- for (int i = 0; i < outer_num_; ++i) { - // initialize scale_data to the first plane - caffe_copy(inner_num_, bottom_data + i * dim, scale_data); - for (int j = 0; j < channels; j++) { - for (int k = 0; k < inner_num_; k++) { - scale_data[k] = std::max(scale_data[k], - bottom_data[i * dim + j * inner_num_ + k]); - } - } - // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, - 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); - // exponentiation - caffe_exp(dim, top_data, top_data); - // sum after exp - caffe_cpu_gemv(CblasTrans, channels, inner_num_, 1., - top_data, sum_multiplier_.cpu_data(), 0., scale_data); - // division - for (int j = 0; j < channels; j++) { - caffe_div(inner_num_, top_data, scale_data, top_data); - top_data += inner_num_; - } - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int channels = bottom[0]->shape(softmax_axis_); + int dim = bottom[0]->count() / outer_num_; + caffe_copy(bottom[0]->count(), bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ for (int i = 0; i < outer_num_; ++i) { + // initialize scale_data to the first plane + caffe_copy(inner_num_, bottom_data + i * dim, scale_data); + for (int j = 0; j < channels; j++) { + for (int k = 0; k < inner_num_; k++) { + scale_data[k] = std::max(scale_data[k], + bottom_data[i * dim + j * inner_num_ + k]); + } + } + // subtraction + caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_, + 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); + // exponentiation + caffe_exp < Dtype > (dim, top_data, top_data); + // sum after exp + caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1., + top_data, sum_multiplier_.cpu_data(), 0., scale_data); + // division + for (int j = 0; j < channels; j++) { + caffe_div(inner_num_, top_data, scale_data, top_data); + top_data += inner_num_; + } + } } -template +template void SoftmaxLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int channels = top[0]->shape(softmax_axis_); - int dim = top[0]->count() / outer_num_; - caffe_copy(top[0]->count(), top_diff, bottom_diff); - for (int i = 0; i < outer_num_; ++i) { - // compute dot(top_diff, top_data) and subtract them from the bottom diff - for (int k = 0; k < inner_num_; ++k) { - scale_data[k] = caffe_cpu_strided_dot(channels, - bottom_diff + i * dim + k, inner_num_, - top_data + i * dim + k, inner_num_); - } - // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, - -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); - } - // elementwise multiplication - caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = 
top[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int channels = top[0]->shape(softmax_axis_); + int dim = top[0]->count() / outer_num_; + caffe_copy(top[0]->count(), top_diff, bottom_diff); + for (int i = 0; i < outer_num_; ++i) { + // compute dot(top_diff, top_data) and subtract them from the bottom diff + for (int k = 0; k < inner_num_; ++k) { + scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels, + bottom_diff + i * dim + k, inner_num_, + top_data + i * dim + k, inner_num_); + } + // subtraction + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, + -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); + } + // elementwise multiplication + caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } -template +template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = bottom[0]->count(); + int channels = top[0]->shape(softmax_axis_); + + caffe_gpu_copy(count, bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. + // compute max + // NOLINT_NEXT_LINE(whitespace/operators) - caffe_gpu_copy(count, bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. 
- // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - - kernel_channel_max(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp(count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div(count, outer_num_, channels, inner_num_, - scale_data, top_data); + kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data, + scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, + scale_data, top_data); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp < Dtype > (count, top_data, top_data); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data, + scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_, + scale_data, top_data); } -template +template void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_gpu_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
- // NOLINT_NEXT_LINE(whitespace/operators) - - kernel_channel_dot(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = top[0]->count(); + int channels = top[0]->shape(softmax_axis_); + caffe_gpu_copy(count, top_diff, bottom_diff); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. + // NOLINT_NEXT_LINE(whitespace/operators) -} + kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_, + top_diff, top_data, scale_data); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, + scale_data, bottom_diff); + // elementwise multiplication + caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); +} #ifdef CPU_ONLY STUB_GPU(SoftmaxLayer); #endif -INSTANTIATE_CLASS(SoftmaxLayer); +INSTANTIATE_CLASS (SoftmaxLayer); } // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 66ac9ea5..b998c2f6 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -9,201 +9,202 @@ namespace caffe { -template +template void SoftmaxWithLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); - LayerParameter softmax_param(this->layer_param_); - softmax_param.set_type("Softmax"); - softmax_layer_ = LayerRegistry::CreateLayer(softmax_param); - softmax_bottom_vec_.clear(); - 
softmax_bottom_vec_.push_back(bottom[0]); - softmax_top_vec_.clear(); - softmax_top_vec_.push_back(&prob_); - softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); - - has_ignore_label_ = - this->layer_param_.loss_param().has_ignore_label(); - if (has_ignore_label_) { - ignore_label_ = this->layer_param_.loss_param().ignore_label(); - } - normalize_ = this->layer_param_.loss_param().normalize(); - - ocl_setup(); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + LayerParameter softmax_param(this->layer_param_); + softmax_param.set_type("Softmax"); + softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param); + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.clear(); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); + + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + normalize_ = this->layer_param_.loss_param().normalize(); + + ocl_setup(); } -template -void SoftmaxWithLossLayer::ocl_setup(){ - d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL); +template +void SoftmaxWithLossLayer::ocl_setup() { + d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, + sizeof(Dtype), NULL, NULL); } -template -SoftmaxWithLossLayer::~SoftmaxWithLossLayer(){ +template +SoftmaxWithLossLayer::~SoftmaxWithLossLayer() { } -template +template void SoftmaxWithLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); - outer_num_ = bottom[0]->count(0, softmax_axis_); - inner_num_ = bottom[0]->count(softmax_axis_ + 1); - CHECK_EQ(outer_num_ * inner_num_, 
bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " - << "label count (number of labels) must be N*H*W, " - << "with integer values in {0, 1, ..., C-1}."; - if (top.size() >= 2) { - // softmax output - top[1]->ReshapeLike(*bottom[0]); - } + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + softmax_axis_ = + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + if (top.size() >= 2) { + // softmax output + top[1]->ReshapeLike(*bottom[0]); + } } -template +template void SoftmaxWithLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // The forward pass computes the softmax prob values. 
- softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.cpu_data(); - const Dtype* label = bottom[1]->cpu_data(); - int dim = prob_.count() / outer_num_; - int count = 0; - Dtype loss = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; j++) { - const int label_value = static_cast(label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - continue; - } - DCHECK_GE(label_value, 0); - DCHECK_LT(label_value, prob_.shape(softmax_axis_)); - loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], - Dtype(FLT_MIN))); - ++count; - } - } - if (normalize_) { - top[0]->mutable_cpu_data()[0] = loss / count; - } else { - top[0]->mutable_cpu_data()[0] = loss / outer_num_; - } - if (top.size() == 2) { - top[1]->ShareData(prob_); - } + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the softmax prob values. + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + int count = 0; + Dtype loss = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, prob_.shape(softmax_axis_)); + loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], + Dtype(FLT_MIN))); + ++count; + } + } + if (normalize_) { + top[0]->mutable_cpu_data()[0] = loss / count; + } else { + top[0]->mutable_cpu_data()[0] = loss / outer_num_; + } + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } -template +template void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " 
Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); - caffe_copy(prob_.count(), prob_data, bottom_diff); - const Dtype* label = bottom[1]->cpu_data(); - int dim = prob_.count() / outer_num_; - int count = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; ++j) { - const int label_value = static_cast(label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { - bottom_diff[i * dim + c * inner_num_ + j] = 0; - } - } else { - bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; - ++count; - } - } - } - // Scale gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - caffe_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* prob_data = prob_.cpu_data(); + caffe_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { + bottom_diff[i * dim + c * inner_num_ + j] = 0; + } + } else { + bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; + ++count; + } + } + } + // Scale gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + caffe_scal(prob_.count(), loss_weight / count, 
bottom_diff); + } else { + caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } } -template +template void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU( nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - printf("loss = %f\n", loss); - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. 
+ Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU < Dtype > (nthreads, prob_data, label, loss_data, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + loss /= count; + } else { + loss /= outer_num_; + } + printf("loss = %f\n", loss); + top[0]->mutable_cpu_data()[0] = loss; + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } -template +template void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. 
- Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); + } else { + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } } #ifdef CPU_ONLY STUB_GPU(SoftmaxWithLossLayer); #endif -INSTANTIATE_CLASS(SoftmaxWithLossLayer); -REGISTER_LAYER_CLASS(SoftmaxWithLoss); +INSTANTIATE_CLASS (SoftmaxWithLossLayer); +REGISTER_LAYER_CLASS (SoftmaxWithLoss); } // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 4b60db10..0ad8179a 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -6,74 +6,79 @@ namespace caffe { -template +template void SplitLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - count_ = bottom[0]->count(); - for (int i = 0; i < top.size(); ++i) { - // Do not allow in-place computation in the SplitLayer. Instead, share data - // by reference in the forward pass, and keep separate diff allocations in - // the backward pass. (Technically, it should be possible to share the diff - // blob of the first split output with the input, but this seems to cause - // some strange effects in practice...) - CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; - top[i]->ReshapeLike(*bottom[0]); - CHECK_EQ(count_, top[i]->count()); - } - gpu_add_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_add_float",NULL); + const vector*>& top) { + count_ = bottom[0]->count(); + for (int i = 0; i < top.size(); ++i) { + // Do not allow in-place computation in the SplitLayer. 
Instead, share data + // by reference in the forward pass, and keep separate diff allocations in + // the backward pass. (Technically, it should be possible to share the diff + // blob of the first split output with the input, but this seems to cause + // some strange effects in practice...) + CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " + "allow in-place computation."; + top[i]->ReshapeLike(*bottom[0]); + CHECK_EQ(count_, top[i]->count()); + } + gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", + NULL); } -template +template void SplitLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } } -template +template void SplitLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); - return; - } - caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), - bottom[0]->mutable_cpu_diff()); - // Add remaining top blob diffs. - for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); + return; + } + caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), + bottom[0]->mutable_cpu_diff()); + // Add remaining top blob diffs. 
+ for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } } -template +template void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } } -template +template void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. - for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); + return; + } + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. 
+ for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } } @@ -81,7 +86,7 @@ void SplitLayer::Backward_gpu(const vector*>& top, STUB_GPU(SplitLayer); #endif -INSTANTIATE_CLASS(SplitLayer); -REGISTER_LAYER_CLASS(Split); +INSTANTIATE_CLASS (SplitLayer); +REGISTER_LAYER_CLASS (Split); } // namespace caffe diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp index 795dd716..bfc7778c 100644 --- a/src/caffe/layers/spp_layer.cpp +++ b/src/caffe/layers/spp_layer.cpp @@ -13,181 +13,180 @@ namespace caffe { using std::min; using std::max; -template +template LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param) { - LayerParameter pooling_param; - int num_bins = pow(2, pyramid_level); - - // find padding and kernel size so that the pooling is - // performed across the entire image - int kernel_h = ceil(bottom_h / static_cast(num_bins)); - // remainder_h is the min number of pixels that need to be padded before - // entire image height is pooled over with the chosen kernel dimension - int remainder_h = kernel_h * num_bins - bottom_h; - // pooling layer pads (2 * pad_h) pixels on the top and bottom of the - // image. 
- int pad_h = (remainder_h + 1) / 2; - - // similar logic for width - int kernel_w = ceil(bottom_w / static_cast(num_bins)); - int remainder_w = kernel_w * num_bins - bottom_w; - int pad_w = (remainder_w + 1) / 2; - - pooling_param.mutable_pooling_param()->set_pad_h(pad_h); - pooling_param.mutable_pooling_param()->set_pad_w(pad_w); - pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h); - pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w); - pooling_param.mutable_pooling_param()->set_stride_h(kernel_h); - pooling_param.mutable_pooling_param()->set_stride_w(kernel_w); - - switch (spp_param.pool()) { - case SPPParameter_PoolMethod_MAX: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); - break; - case SPPParameter_PoolMethod_AVE: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - break; - case SPPParameter_PoolMethod_STOCHASTIC: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - - return pooling_param; + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + LayerParameter pooling_param; + int num_bins = pow(2, pyramid_level); + + // find padding and kernel size so that the pooling is + // performed across the entire image + int kernel_h = ceil(bottom_h / static_cast(num_bins)); + // remainder_h is the min number of pixels that need to be padded before + // entire image height is pooled over with the chosen kernel dimension + int remainder_h = kernel_h * num_bins - bottom_h; + // pooling layer pads (2 * pad_h) pixels on the top and bottom of the + // image. 
+ int pad_h = (remainder_h + 1) / 2; + + // similar logic for width + int kernel_w = ceil(bottom_w / static_cast(num_bins)); + int remainder_w = kernel_w * num_bins - bottom_w; + int pad_w = (remainder_w + 1) / 2; + + pooling_param.mutable_pooling_param()->set_pad_h(pad_h); + pooling_param.mutable_pooling_param()->set_pad_w(pad_w); + pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h); + pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w); + pooling_param.mutable_pooling_param()->set_stride_h(kernel_h); + pooling_param.mutable_pooling_param()->set_stride_w(kernel_w); + + switch (spp_param.pool()) { + case SPPParameter_PoolMethod_MAX: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_MAX); + break; + case SPPParameter_PoolMethod_AVE: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + break; + case SPPParameter_PoolMethod_STOCHASTIC: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_STOCHASTIC); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + + return pooling_param; } -template +template void SPPLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SPPParameter spp_param = this->layer_param_.spp_param(); - - bottom_h_ = bottom[0]->height(); - bottom_w_ = bottom[0]->width(); - CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero."; - CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero."; - - pyramid_height_ = spp_param.pyramid_height(); - split_top_vec_.clear(); - pooling_bottom_vecs_.clear(); - pooling_layers_.clear(); - pooling_top_vecs_.clear(); - pooling_outputs_.clear(); - flatten_layers_.clear(); - flatten_top_vecs_.clear(); - flatten_outputs_.clear(); - concat_bottom_vec_.clear(); - - // split layer output holders setup - for (int i = 0; i < pyramid_height_; i++) { - split_top_vec_.push_back(new Blob()); - } - - // split layer setup - LayerParameter split_param; - split_layer_.reset(new 
SplitLayer(split_param)); - split_layer_->SetUp(bottom, split_top_vec_); - - for (int i = 0; i < pyramid_height_; i++) { - // pooling layer input holders setup - pooling_bottom_vecs_.push_back(new vector*>); - pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]); - - // pooling layer output holders setup - pooling_outputs_.push_back(new Blob()); - pooling_top_vecs_.push_back(new vector*>); - pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); - - // pooling layer setup - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_.push_back(shared_ptr > ( - new PoolingLayer(pooling_param))); - pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - - // flatten layer output holders setup - flatten_outputs_.push_back(new Blob()); - flatten_top_vecs_.push_back(new vector*>); - flatten_top_vecs_[i]->push_back(flatten_outputs_[i]); - - // flatten layer setup - LayerParameter flatten_param; - flatten_layers_.push_back(new FlattenLayer(flatten_param)); - flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); - - // concat layer input holders setup - concat_bottom_vec_.push_back(flatten_outputs_[i]); - } - - // concat layer setup - LayerParameter concat_param; - concat_layer_.reset(new ConcatLayer(concat_param)); - concat_layer_->SetUp(concat_bottom_vec_, top); + const vector*>& top) { + SPPParameter spp_param = this->layer_param_.spp_param(); + + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero."; + CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero."; + + pyramid_height_ = spp_param.pyramid_height(); + split_top_vec_.clear(); + pooling_bottom_vecs_.clear(); + pooling_layers_.clear(); + pooling_top_vecs_.clear(); + pooling_outputs_.clear(); + flatten_layers_.clear(); + flatten_top_vecs_.clear(); + flatten_outputs_.clear(); + concat_bottom_vec_.clear(); + + // split layer output holders setup + for 
(int i = 0; i < pyramid_height_; i++) { + split_top_vec_.push_back(new Blob()); + } + + // split layer setup + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, split_top_vec_); + + for (int i = 0; i < pyramid_height_; i++) { + // pooling layer input holders setup + pooling_bottom_vecs_.push_back(new vector*>); + pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]); + + // pooling layer output holders setup + pooling_outputs_.push_back(new Blob()); + pooling_top_vecs_.push_back(new vector*>); + pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); + + // pooling layer setup + LayerParameter pooling_param = GetPoolingParam( + i, bottom_h_, bottom_w_, spp_param); + + pooling_layers_.push_back(shared_ptr < PoolingLayer > ( + new PoolingLayer(pooling_param))); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + + // flatten layer output holders setup + flatten_outputs_.push_back(new Blob()); + flatten_top_vecs_.push_back(new vector*>); + flatten_top_vecs_[i]->push_back(flatten_outputs_[i]); + + // flatten layer setup + LayerParameter flatten_param; + flatten_layers_.push_back(new FlattenLayer(flatten_param)); + flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); + + // concat layer input holders setup + concat_bottom_vec_.push_back(flatten_outputs_[i]); + } + + // concat layer setup + LayerParameter concat_param; + concat_layer_.reset(new ConcatLayer(concat_param)); + concat_layer_->SetUp(concat_bottom_vec_, top); } -template +template void SPPLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - bottom_h_ = bottom[0]->height(); - bottom_w_ = bottom[0]->width(); - SPPParameter spp_param = this->layer_param_.spp_param(); - split_layer_->Reshape(bottom, split_top_vec_); - for (int i = 0; i < 
pyramid_height_; i++) { - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_[i].reset( - new PoolingLayer(pooling_param)); - pooling_layers_[i]->SetUp( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - pooling_layers_[i]->Reshape( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Reshape( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); - } - concat_layer_->Reshape(concat_bottom_vec_, top); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = bottom[0]->channels(); + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + SPPParameter spp_param = this->layer_param_.spp_param(); + split_layer_->Reshape(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + LayerParameter pooling_param = GetPoolingParam( + i, bottom_h_, bottom_w_, spp_param); + + pooling_layers_[i].reset( + new PoolingLayer(pooling_param)); + pooling_layers_[i]->SetUp( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + pooling_layers_[i]->Reshape( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + flatten_layers_[i]->Reshape( + *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + } + concat_layer_->Reshape(concat_bottom_vec_, top); } -template +template void SPPLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - split_layer_->Forward(bottom, split_top_vec_); - for (int i = 0; i < pyramid_height_; i++) { - pooling_layers_[i]->Forward( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Forward( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); - } - concat_layer_->Forward(concat_bottom_vec_, top); + const vector*>& top) { + split_layer_->Forward(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + pooling_layers_[i]->Forward( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + flatten_layers_[i]->Forward( + 
*pooling_top_vecs_[i], *flatten_top_vecs_[i]); + } + concat_layer_->Forward(concat_bottom_vec_, top); } -template +template void SPPLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - vector concat_propagate_down(pyramid_height_, true); - concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); - for (int i = 0; i < pyramid_height_; i++) { - flatten_layers_[i]->Backward( - *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); - pooling_layers_[i]->Backward( - *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); - } - split_layer_->Backward(split_top_vec_, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + vector concat_propagate_down(pyramid_height_, true); + concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); + for (int i = 0; i < pyramid_height_; i++) { + flatten_layers_[i]->Backward( + *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); + pooling_layers_[i]->Backward( + *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); + } + split_layer_->Backward(split_top_vec_, propagate_down, bottom); } - -INSTANTIATE_CLASS(SPPLayer); -REGISTER_LAYER_CLASS(SPP); +INSTANTIATE_CLASS (SPPLayer); +REGISTER_LAYER_CLASS (SPP); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index a922adbd..16405761 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -10,63 +10,61 @@ namespace caffe { -template +template void TanHLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = tanh(bottom_data[i]); - } + const vector*>& top) { + const Dtype* bottom_data = 
bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = tanh(bottom_data[i]); + } } -template +template void TanHLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype tanhx; - for (int i = 0; i < count; ++i) { - tanhx = top_data[i]; - bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx); - } - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype tanhx; + for (int i = 0; i < count; ++i) { + tanhx = top_data[i]; + bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx); + } + } } -template +template void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHForward(count, bottom_data, top_data); } -template +template void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom){ - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = 
bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward(count, top_diff, top_data, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHBackward(count, top_diff, top_data, bottom_diff); + } } -} - - #ifdef CPU_ONLY STUB_GPU(TanHLayer); #endif -INSTANTIATE_CLASS(TanHLayer); +INSTANTIATE_CLASS (TanHLayer); } // namespace caffe diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index b3e1bea7..ca14de00 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -4,42 +4,41 @@ #include "caffe/vision_layers.hpp" #include "caffe/util/ocl_wrapper.hpp" - namespace caffe { -template +template void ThresholdLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.threshold_param().threshold(); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.threshold_param().threshold(); } -template +template void ThresholdLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = (bottom_data[i] > threshold_) ? 
Dtype(1) : Dtype(0); + } } -template +template void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top){ - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward(count, threshold_, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ThresholdForward(count, threshold_, bottom_data, top_data); } #ifdef CPU_ONLY STUB_GPU_FORWARD(ThresholdLayer, Forward); #endif -INSTANTIATE_CLASS(ThresholdLayer); -REGISTER_LAYER_CLASS(Threshold); +INSTANTIATE_CLASS (ThresholdLayer); +REGISTER_LAYER_CLASS (Threshold); } // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index cc7dc79d..0525b640 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -25,409 +25,410 @@ namespace caffe { -template +template WindowDataLayer::~WindowDataLayer() { - this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } -template +template void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - // LayerSetUp runs through the window_file and creates two structures - // that hold windows: one for foreground (object) windows and one - // for background (non-object) windows. We use an overlap threshold - // to decide which is which. 
- - // window_file format - // repeated: - // # image_index - // img_path (abs path) - // channels - // height - // width - // num_windows - // class_index overlap x1 y1 x2 y2 - - LOG(INFO) << "Window data layer:" << std::endl - << " foreground (object) overlap threshold: " - << this->layer_param_.window_data_param().fg_threshold() << std::endl - << " background (non-object) overlap threshold: " - << this->layer_param_.window_data_param().bg_threshold() << std::endl - << " foreground sampling fraction: " - << this->layer_param_.window_data_param().fg_fraction() << std::endl - << " cache_images: " - << this->layer_param_.window_data_param().cache_images() << std::endl - << " root_folder: " - << this->layer_param_.window_data_param().root_folder(); - - cache_images_ = this->layer_param_.window_data_param().cache_images(); - string root_folder = this->layer_param_.window_data_param().root_folder(); - - const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); - if (prefetch_needs_rand) { - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - } else { - prefetch_rng_.reset(); - } - - std::ifstream infile(this->layer_param_.window_data_param().source().c_str()); - CHECK(infile.good()) << "Failed to open window file " - << this->layer_param_.window_data_param().source() << std::endl; - - map label_hist; - label_hist.insert(std::make_pair(0, 0)); - - string hashtag; - int image_index, channels; - if (!(infile >> hashtag >> image_index)) { - LOG(FATAL) << "Window file is empty"; - } - do { - CHECK_EQ(hashtag, "#"); - // read image path - string image_path; - infile >> image_path; - image_path = root_folder + image_path; - // read image dimensions - vector image_size(3); - infile >> image_size[0] >> image_size[1] >> image_size[2]; - channels = image_size[0]; - image_database_.push_back(std::make_pair(image_path, image_size)); - - if (cache_images_) { - Datum 
datum; - if (!ReadFileToDatum(image_path, &datum)) { - LOG(ERROR) << "Could not open or find file " << image_path; - return; - } - image_database_cache_.push_back(std::make_pair(image_path, datum)); - } - // read each box - int num_windows; - infile >> num_windows; - const float fg_threshold = - this->layer_param_.window_data_param().fg_threshold(); - const float bg_threshold = - this->layer_param_.window_data_param().bg_threshold(); - for (int i = 0; i < num_windows; ++i) { - int label, x1, y1, x2, y2; - float overlap; - infile >> label >> overlap >> x1 >> y1 >> x2 >> y2; - - vector window(WindowDataLayer::NUM); - window[WindowDataLayer::IMAGE_INDEX] = image_index; - window[WindowDataLayer::LABEL] = label; - window[WindowDataLayer::OVERLAP] = overlap; - window[WindowDataLayer::X1] = x1; - window[WindowDataLayer::Y1] = y1; - window[WindowDataLayer::X2] = x2; - window[WindowDataLayer::Y2] = y2; - - // add window to foreground list or background list - if (overlap >= fg_threshold) { - int label = window[WindowDataLayer::LABEL]; - CHECK_GT(label, 0); - fg_windows_.push_back(window); - label_hist.insert(std::make_pair(label, 0)); - label_hist[label]++; - } else if (overlap < bg_threshold) { - // background window, force label and overlap to 0 - window[WindowDataLayer::LABEL] = 0; - window[WindowDataLayer::OVERLAP] = 0; - bg_windows_.push_back(window); - label_hist[0]++; - } - } - - if (image_index % 100 == 0) { - LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; - } - } while (infile >> hashtag >> image_index); - - LOG(INFO) << "Number of images: " << image_index+1; - - for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { - LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; - } - - LOG(INFO) << "Amount of context padding: " - << 
this->layer_param_.window_data_param().context_pad(); - - LOG(INFO) << "Crop mode: " - << this->layer_param_.window_data_param().crop_mode(); - - // image - const int crop_size = this->transform_param_.crop_size(); - CHECK_GT(crop_size, 0); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - top[0]->Reshape(batch_size, channels, crop_size, crop_size); - this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); - - // data mean - has_mean_file_ = this->transform_param_.has_mean_file(); - has_mean_values_ = this->transform_param_.mean_value_size() > 0; - if (has_mean_file_) { - const string& mean_file = - this->transform_param_.mean_file(); - LOG(INFO) << "Loading mean file from: " << mean_file; - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - if (has_mean_values_) { - CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { - mean_values_.push_back(this->transform_param_.mean_value(c)); - } - CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << - "Specify either 1 mean_value or as many as channels: " << channels; - if (channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } + const vector*>& top) { + // LayerSetUp runs through the window_file and creates two structures + // that hold windows: one for foreground (object) windows and one + // for background (non-object) windows. 
We use an overlap threshold + // to decide which is which. + + // window_file format + // repeated: + // # image_index + // img_path (abs path) + // channels + // height + // width + // num_windows + // class_index overlap x1 y1 x2 y2 + + LOG(INFO) << "Window data layer:" << std::endl + << " foreground (object) overlap threshold: " + << this->layer_param_.window_data_param().fg_threshold() << std::endl + << " background (non-object) overlap threshold: " + << this->layer_param_.window_data_param().bg_threshold() << std::endl + << " foreground sampling fraction: " + << this->layer_param_.window_data_param().fg_fraction() << std::endl + << " cache_images: " + << this->layer_param_.window_data_param().cache_images() << std::endl + << " root_folder: " + << this->layer_param_.window_data_param().root_folder(); + + cache_images_ = this->layer_param_.window_data_param().cache_images(); + string root_folder = this->layer_param_.window_data_param().root_folder(); + + const bool prefetch_needs_rand = + this->transform_param_.mirror() || + this->transform_param_.crop_size(); + if (prefetch_needs_rand) { + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + } else { + prefetch_rng_.reset(); + } + + std::ifstream infile(this->layer_param_.window_data_param().source().c_str()); + CHECK(infile.good()) << "Failed to open window file " + << this->layer_param_.window_data_param().source() << std::endl; + + map label_hist; + label_hist.insert(std::make_pair(0, 0)); + + string hashtag; + int image_index, channels; + if (!(infile >> hashtag >> image_index)) { + LOG(FATAL) << "Window file is empty"; + } + do { + CHECK_EQ(hashtag, "#"); + // read image path + string image_path; + infile >> image_path; + image_path = root_folder + image_path; + // read image dimensions + vector image_size(3); + infile >> image_size[0] >> image_size[1] >> image_size[2]; + channels = image_size[0]; + 
image_database_.push_back(std::make_pair(image_path, image_size)); + + if (cache_images_) { + Datum datum; + if (!ReadFileToDatum(image_path, &datum)) { + LOG(ERROR) << "Could not open or find file " << image_path; + return; + } + image_database_cache_.push_back(std::make_pair(image_path, datum)); + } + // read each box + int num_windows; + infile >> num_windows; + const float fg_threshold = + this->layer_param_.window_data_param().fg_threshold(); + const float bg_threshold = + this->layer_param_.window_data_param().bg_threshold(); + for (int i = 0; i < num_windows; ++i) { + int label, x1, y1, x2, y2; + float overlap; + infile >> label >> overlap >> x1 >> y1 >> x2 >> y2; + + vector window(WindowDataLayer::NUM); + window[WindowDataLayer::IMAGE_INDEX] = image_index; + window[WindowDataLayer::LABEL] = label; + window[WindowDataLayer::OVERLAP] = overlap; + window[WindowDataLayer::X1] = x1; + window[WindowDataLayer::Y1] = y1; + window[WindowDataLayer::X2] = x2; + window[WindowDataLayer::Y2] = y2; + + // add window to foreground list or background list + if (overlap >= fg_threshold) { + int label = window[WindowDataLayer::LABEL]; + CHECK_GT(label, 0); + fg_windows_.push_back(window); + label_hist.insert(std::make_pair(label, 0)); + label_hist[label]++; + } else if (overlap < bg_threshold) { + // background window, force label and overlap to 0 + window[WindowDataLayer::LABEL] = 0; + window[WindowDataLayer::OVERLAP] = 0; + bg_windows_.push_back(window); + label_hist[0]++; + } + } + + if (image_index % 100 == 0) { + LOG(INFO) << "num: " << image_index << " " + << image_path << " " + << image_size[0] << " " + << image_size[1] << " " + << image_size[2] << " " + << "windows to process: " << num_windows; + } + } while (infile >> hashtag >> image_index); + + LOG(INFO) << "Number of images: " << image_index + 1; + + for (map::iterator it = label_hist.begin(); + it != label_hist.end(); ++it) { + LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] + << " 
samples"; + } + + LOG(INFO) << "Amount of context padding: " + << this->layer_param_.window_data_param().context_pad(); + + LOG(INFO) << "Crop mode: " + << this->layer_param_.window_data_param().crop_mode(); + + // image + const int crop_size = this->transform_param_.crop_size(); + CHECK_GT(crop_size, 0); + const int batch_size = this->layer_param_.window_data_param().batch_size(); + top[0]->Reshape(batch_size, channels, crop_size, crop_size); + this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); + + // data mean + has_mean_file_ = this->transform_param_.has_mean_file(); + has_mean_values_ = this->transform_param_.mean_value_size() > 0; + if (has_mean_file_) { + const string& mean_file = + this->transform_param_.mean_file(); + LOG(INFO) << "Loading mean file from: " << mean_file; + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + data_mean_.FromProto(blob_proto); + } + if (has_mean_values_) { + CHECK(has_mean_file_ == false) << + "Cannot specify mean_file and mean_value at the same time"; + for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { + mean_values_.push_back(this->transform_param_.mean_value(c)); + } + CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << + "Specify either 1 mean_value or as many as channels: " << channels; + if (channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } } -template +template unsigned int WindowDataLayer::PrefetchRand() { - CHECK(prefetch_rng_); - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - return (*prefetch_rng)(); + 
CHECK (prefetch_rng_); + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + return (*prefetch_rng)(); } // Thread fetching the data -template +template void WindowDataLayer::InternalThreadEntry() { - // At each iteration, sample N windows where N*p are foreground (object) - // windows and N*(1-p) are background (non-object) windows - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); - const Dtype scale = this->layer_param_.window_data_param().scale(); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - const int context_pad = this->layer_param_.window_data_param().context_pad(); - const int crop_size = this->transform_param_.crop_size(); - const bool mirror = this->transform_param_.mirror(); - const float fg_fraction = - this->layer_param_.window_data_param().fg_fraction(); - Dtype* mean = NULL; - int mean_off = 0; - int mean_width = 0; - int mean_height = 0; - if (this->has_mean_file_) { - mean = this->data_mean_.mutable_cpu_data(); - mean_off = (this->data_mean_.width() - crop_size) / 2; - mean_width = this->data_mean_.width(); - mean_height = this->data_mean_.height(); - } - cv::Size cv_crop_size(crop_size, crop_size); - const string& crop_mode = this->layer_param_.window_data_param().crop_mode(); - - bool use_square = (crop_mode == "square") ? 
true : false; - - // zero out batch - caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); - - const int num_fg = static_cast(static_cast(batch_size) - * fg_fraction); - const int num_samples[2] = { batch_size - num_fg, num_fg }; - - int item_id = 0; - // sample from bg set then fg set - for (int is_fg = 0; is_fg < 2; ++is_fg) { - for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { - // sample a window - timer.Start(); - const unsigned int rand_index = PrefetchRand(); - vector window = (is_fg) ? - fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; - - bool do_mirror = mirror && PrefetchRand() % 2; - - // load the image containing the window - pair > image = - image_database_[window[WindowDataLayer::IMAGE_INDEX]]; - - cv::Mat cv_img; - if (this->cache_images_) { - pair image_cached = - image_database_cache_[window[WindowDataLayer::IMAGE_INDEX]]; - cv_img = DecodeDatumToCVMat(image_cached.second, true); - } else { - cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); - if (!cv_img.data) { - LOG(ERROR) << "Could not open or find file " << image.first; - return; - } - } - read_time += timer.MicroSeconds(); - timer.Start(); - const int channels = cv_img.channels(); - - // crop window out of image and warp it - int x1 = window[WindowDataLayer::X1]; - int y1 = window[WindowDataLayer::Y1]; - int x2 = window[WindowDataLayer::X2]; - int y2 = window[WindowDataLayer::Y2]; - - int pad_w = 0; - int pad_h = 0; - if (context_pad > 0 || use_square) { - // scale factor by which to expand the original region - // such that after warping the expanded region to crop_size x crop_size - // there's exactly context_pad amount of padding on each side - Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2*context_pad); - - // compute the expanded region - Dtype half_height = static_cast(y2-y1+1)/2.0; - Dtype half_width = static_cast(x2-x1+1)/2.0; - Dtype center_x = static_cast(x1) + half_width; - Dtype 
center_y = static_cast(y1) + half_height; - if (use_square) { - if (half_height > half_width) { - half_width = half_height; - } else { - half_height = half_width; - } - } - x1 = static_cast(round(center_x - half_width*context_scale)); - x2 = static_cast(round(center_x + half_width*context_scale)); - y1 = static_cast(round(center_y - half_height*context_scale)); - y2 = static_cast(round(center_y + half_height*context_scale)); - - // the expanded region may go outside of the image - // so we compute the clipped (expanded) region and keep track of - // the extent beyond the image - int unclipped_height = y2-y1+1; - int unclipped_width = x2-x1+1; - int pad_x1 = std::max(0, -x1); - int pad_y1 = std::max(0, -y1); - int pad_x2 = std::max(0, x2 - cv_img.cols + 1); - int pad_y2 = std::max(0, y2 - cv_img.rows + 1); - // clip bounds - x1 = x1 + pad_x1; - x2 = x2 - pad_x2; - y1 = y1 + pad_y1; - y2 = y2 - pad_y2; - CHECK_GT(x1, -1); - CHECK_GT(y1, -1); - CHECK_LT(x2, cv_img.cols); - CHECK_LT(y2, cv_img.rows); - - int clipped_height = y2-y1+1; - int clipped_width = x2-x1+1; - - // scale factors that would be used to warp the unclipped - // expanded region - Dtype scale_x = - static_cast(crop_size)/static_cast(unclipped_width); - Dtype scale_y = - static_cast(crop_size)/static_cast(unclipped_height); - - // size to warp the clipped expanded region to - cv_crop_size.width = - static_cast(round(static_cast(clipped_width)*scale_x)); - cv_crop_size.height = - static_cast(round(static_cast(clipped_height)*scale_y)); - pad_x1 = static_cast(round(static_cast(pad_x1)*scale_x)); - pad_x2 = static_cast(round(static_cast(pad_x2)*scale_x)); - pad_y1 = static_cast(round(static_cast(pad_y1)*scale_y)); - pad_y2 = static_cast(round(static_cast(pad_y2)*scale_y)); - - pad_h = pad_y1; - // if we're mirroring, we mirror the padding too (to be pedantic) - if (do_mirror) { - pad_w = pad_x2; - } else { - pad_w = pad_x1; - } - - // ensure that the warped, clipped region plus the padding fits in the - // 
crop_size x crop_size image (it might not due to rounding) - if (pad_h + cv_crop_size.height > crop_size) { - cv_crop_size.height = crop_size - pad_h; - } - if (pad_w + cv_crop_size.width > crop_size) { - cv_crop_size.width = crop_size - pad_w; - } - } - - cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1); - cv::Mat cv_cropped_img = cv_img(roi); - cv::resize(cv_cropped_img, cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); - - // horizontal flip at random - if (do_mirror) { - cv::flip(cv_cropped_img, cv_cropped_img, 1); - } - - // copy the warped window into top_data - for (int h = 0; h < cv_cropped_img.rows; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); - int img_index = 0; - for (int w = 0; w < cv_cropped_img.cols; ++w) { - for (int c = 0; c < channels; ++c) { - int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (this->has_mean_file_) { - int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; - top_data[top_index] = (pixel - mean[mean_index]) * scale; - } else { - if (this->has_mean_values_) { - top_data[top_index] = (pixel - this->mean_values_[c]) * scale; - } else { - top_data[top_index] = pixel * scale; - } - } - } - } - } - trans_time += timer.MicroSeconds(); - // get window label - top_label[item_id] = window[WindowDataLayer::LABEL]; - - item_id++; - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + // At each iteration, sample N windows where N*p are foreground (object) + // windows and N*(1-p) are background (non-object) windows + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + Dtype* top_data = 
this->prefetch_data_.mutable_cpu_data(); + Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); + const Dtype scale = this->layer_param_.window_data_param().scale(); + const int batch_size = this->layer_param_.window_data_param().batch_size(); + const int context_pad = this->layer_param_.window_data_param().context_pad(); + const int crop_size = this->transform_param_.crop_size(); + const bool mirror = this->transform_param_.mirror(); + const float fg_fraction = + this->layer_param_.window_data_param().fg_fraction(); + Dtype* mean = NULL; + int mean_off = 0; + int mean_width = 0; + int mean_height = 0; + if (this->has_mean_file_) { + mean = this->data_mean_.mutable_cpu_data(); + mean_off = (this->data_mean_.width() - crop_size) / 2; + mean_width = this->data_mean_.width(); + mean_height = this->data_mean_.height(); + } + cv::Size cv_crop_size(crop_size, crop_size); + const string& crop_mode = this->layer_param_.window_data_param().crop_mode(); + + bool use_square = (crop_mode == "square") ? true : false; + + // zero out batch + caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); + + const int num_fg = static_cast(static_cast(batch_size) + * fg_fraction); + const int num_samples[2] = { batch_size - num_fg, num_fg }; + + int item_id = 0; + // sample from bg set then fg set + for (int is_fg = 0; is_fg < 2; ++is_fg) { + for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { + // sample a window + timer.Start(); + const unsigned int rand_index = PrefetchRand(); + vector window = + (is_fg) ? 
+ fg_windows_[rand_index % fg_windows_.size()] : + bg_windows_[rand_index % bg_windows_.size()]; + + bool do_mirror = mirror && PrefetchRand() % 2; + + // load the image containing the window + pair > image = + image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + + cv::Mat cv_img; + if (this->cache_images_) { + pair < std::string, Datum > image_cached = + image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + cv_img = DecodeDatumToCVMat(image_cached.second, true); + } else { + cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); + if (!cv_img.data) { + LOG(ERROR) << "Could not open or find file " << image.first; + return; + } + } + read_time += timer.MicroSeconds(); + timer.Start(); + const int channels = cv_img.channels(); + + // crop window out of image and warp it + int x1 = window[WindowDataLayer < Dtype > ::X1]; + int y1 = window[WindowDataLayer < Dtype > ::Y1]; + int x2 = window[WindowDataLayer < Dtype > ::X2]; + int y2 = window[WindowDataLayer < Dtype > ::Y2]; + + int pad_w = 0; + int pad_h = 0; + if (context_pad > 0 || use_square) { + // scale factor by which to expand the original region + // such that after warping the expanded region to crop_size x crop_size + // there's exactly context_pad amount of padding on each side + Dtype context_scale = static_cast(crop_size) / + static_cast(crop_size - 2 * context_pad); + + // compute the expanded region + Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; + Dtype half_width = static_cast(x2 - x1 + 1) / 2.0; + Dtype center_x = static_cast(x1) + half_width; + Dtype center_y = static_cast(y1) + half_height; + if (use_square) { + if (half_height > half_width) { + half_width = half_height; + } else { + half_height = half_width; + } + } + x1 = static_cast(round(center_x - half_width * context_scale)); + x2 = static_cast(round(center_x + half_width * context_scale)); + y1 = static_cast(round(center_y - half_height * context_scale)); + y2 = static_cast(round(center_y + half_height * 
context_scale)); + + // the expanded region may go outside of the image + // so we compute the clipped (expanded) region and keep track of + // the extent beyond the image + int unclipped_height = y2 - y1 + 1; + int unclipped_width = x2 - x1 + 1; + int pad_x1 = std::max(0, -x1); + int pad_y1 = std::max(0, -y1); + int pad_x2 = std::max(0, x2 - cv_img.cols + 1); + int pad_y2 = std::max(0, y2 - cv_img.rows + 1); + // clip bounds + x1 = x1 + pad_x1; + x2 = x2 - pad_x2; + y1 = y1 + pad_y1; + y2 = y2 - pad_y2; + CHECK_GT(x1, -1); + CHECK_GT(y1, -1); + CHECK_LT(x2, cv_img.cols); + CHECK_LT(y2, cv_img.rows); + + int clipped_height = y2 - y1 + 1; + int clipped_width = x2 - x1 + 1; + + // scale factors that would be used to warp the unclipped + // expanded region + Dtype scale_x = + static_cast(crop_size) / static_cast(unclipped_width); + Dtype scale_y = + static_cast(crop_size) / static_cast(unclipped_height); + + // size to warp the clipped expanded region to + cv_crop_size.width = + static_cast(round(static_cast(clipped_width) * scale_x)); + cv_crop_size.height = + static_cast(round(static_cast(clipped_height) * scale_y)); + pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); + pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); + pad_y1 = static_cast(round(static_cast(pad_y1) * scale_y)); + pad_y2 = static_cast(round(static_cast(pad_y2) * scale_y)); + + pad_h = pad_y1; + // if we're mirroring, we mirror the padding too (to be pedantic) + if (do_mirror) { + pad_w = pad_x2; + } else { + pad_w = pad_x1; + } + + // ensure that the warped, clipped region plus the padding fits in the + // crop_size x crop_size image (it might not due to rounding) + if (pad_h + cv_crop_size.height > crop_size) { + cv_crop_size.height = crop_size - pad_h; + } + if (pad_w + cv_crop_size.width > crop_size) { + cv_crop_size.width = crop_size - pad_w; + } + } + + cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); + cv::Mat cv_cropped_img = cv_img(roi); + cv::resize(cv_cropped_img, 
cv_cropped_img, + cv_crop_size, 0, 0, cv::INTER_LINEAR); + + // horizontal flip at random + if (do_mirror) { + cv::flip(cv_cropped_img, cv_cropped_img, 1); + } + + // copy the warped window into top_data + for (int h = 0; h < cv_cropped_img.rows; ++h) { + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < cv_cropped_img.cols; ++w) { + for (int c = 0; c < channels; ++c) { + int top_index = ((item_id * channels + c) * crop_size + h + pad_h) + * crop_size + w + pad_w; + // int top_index = (c * height + h) * width + w; + Dtype pixel = static_cast(ptr[img_index++]); + if (this->has_mean_file_) { + int mean_index = (c * mean_height + h + mean_off + pad_h) + * mean_width + w + mean_off + pad_w; + top_data[top_index] = (pixel - mean[mean_index]) * scale; + } else { + if (this->has_mean_values_) { + top_data[top_index] = (pixel - this->mean_values_[c]) * scale; + } else { + top_data[top_index] = pixel * scale; + } + } + } + } + } + trans_time += timer.MicroSeconds(); + // get window label + top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL]; + + item_id++; + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(WindowDataLayer); -REGISTER_LAYER_CLASS(WindowData); +INSTANTIATE_CLASS (WindowDataLayer); +REGISTER_LAYER_CLASS (WindowData); } // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index e070d774..53ec5461 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -19,863 +19,901 @@ namespace caffe { -template +template Net::Net(const NetParameter& param) { - Init(param); + Init(param); } -template +template Net::Net(const string& param_file, Phase phase) { - NetParameter param; - ReadNetParamsFromTextFileOrDie(param_file, ¶m); - param.mutable_state()->set_phase(phase); - Init(param); + 
NetParameter param; + ReadNetParamsFromTextFileOrDie(param_file, ¶m); + param.mutable_state()->set_phase(phase); + Init(param); } -template +template void Net::Init(const NetParameter& in_param) { - // Set phase from the state. - phase_ = in_param.state().phase(); - // Filter layers based on their include/exclude rules and - // the current NetState. - NetParameter filtered_param; - FilterNet(in_param, &filtered_param); - LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); - // Create a copy of filtered_param with splits added where necessary. - NetParameter param; - InsertSplits(filtered_param, ¶m); - // Basically, build all the layers and set up their connections. - name_ = param.name(); - map blob_name_to_idx; - set available_blobs; - CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) - << "Must specify either input_shape OR deprecated input_dim, not both."; - if (param.input_dim_size() > 0) { - // Deprecated 4D dimensions. 
- CHECK_EQ(param.input_size() * 4, param.input_dim_size()) - << "Incorrect input blob dimension specifications."; - } else { - CHECK_EQ(param.input_size(), param.input_shape_size()) - << "Exactly one input_shape must be specified per input."; - } - memory_used_ = 0; - // set the input blobs - for (int input_id = 0; input_id < param.input_size(); ++input_id) { - const int layer_id = -1; // inputs have fake layer ID -1 - AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - // For each layer, set up its input and output - bottom_vecs_.resize(param.layer_size()); - top_vecs_.resize(param.layer_size()); - bottom_id_vecs_.resize(param.layer_size()); - param_id_vecs_.resize(param.layer_size()); - top_id_vecs_.resize(param.layer_size()); - bottom_need_backward_.resize(param.layer_size()); - for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { - // Inherit phase from net if unset. - if (!param.layer(layer_id).has_phase()) { - param.mutable_layer(layer_id)->set_phase(phase_); - } - // Setup layer. - const LayerParameter& layer_param = param.layer(layer_id); - if (layer_param.propagate_down_size() > 0) { - CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) - << "propagate_down param must be specified " - << "either 0 or bottom_size times "; - } - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); - layer_names_.push_back(layer_param.name()); - LOG(INFO) << "Creating Layer " << layer_param.name(); - bool need_backward = false; - - // Figure out this layer's input and output - for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { - const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); - // If a blob needs backward, this layer should provide it. 
- need_backward |= blob_need_backward_[blob_id]; - } - int num_top = layer_param.top_size(); - for (int top_id = 0; top_id < num_top; ++top_id) { - AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); - } - // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter - // specified fewer than the required number (as specified by - // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. - Layer* layer = layers_[layer_id].get(); - if (layer->AutoTopBlobs()) { - const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); - for (; num_top < needed_num_top; ++num_top) { - // Add "anonymous" top blobs -- do not modify available_blobs or - // blob_name_to_idx as we don't want these blobs to be usable as input - // to other layers. - AppendTop(param, layer_id, num_top, NULL, NULL); - } - } - // After this layer is connected, set it up. - LOG(INFO) << "Setting up " << layer_names_[layer_id]; - layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { - blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); - } - blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); - if (layer->loss(top_id)) { - LOG(INFO) << " with loss weight " << layer->loss(top_id); - } - memory_used_ += top_vecs_[layer_id][top_id]->count(); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - const int param_size = layer_param.param_size(); - const int num_param_blobs = layers_[layer_id]->blobs().size(); - CHECK_LE(param_size, num_param_blobs) - << "Too many params specified for layer " << layer_param.name(); - ParamSpec default_param_spec; - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - const ParamSpec* param_spec = 
(param_id < param_size) ? - &layer_param.param(param_id) : &default_param_spec; - const bool param_need_backward = param_spec->lr_mult() > 0; - need_backward |= param_need_backward; - layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); - } - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - AppendParam(param, layer_id, param_id); - } - // Finally, set the backward flag - layer_need_backward_.push_back(need_backward); - if (need_backward) { - for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { - blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; - } - } - } - // Go through the net backwards to determine which blobs contribute to the - // loss. We can skip backward computation for blobs that don't contribute - // to the loss. - // Also checks if all bottom blobs don't need backward computation (possible - // because the skip_propagate_down param) and so we can skip bacward - // computation for the entire layer - set blobs_under_loss; - set blobs_skip_backp; - for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { - bool layer_contributes_loss = false; - bool layer_skip_propagate_down = true; - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { - layer_contributes_loss = true; - } - if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { - layer_skip_propagate_down = false; - } - if (layer_contributes_loss && !layer_skip_propagate_down) - break; - } - // If this layer can skip backward computation, also all his bottom blobs - // don't need backpropagation - if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { - layer_need_backward_[layer_id] = false; - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - 
bottom_need_backward_[layer_id][bottom_id] = false; - } - } - if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } - if (layer_need_backward_[layer_id]) { - LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; - } else { - LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; - } - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - if (layer_contributes_loss) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - blobs_under_loss.insert(blob_name); - } else { - bottom_need_backward_[layer_id][bottom_id] = false; - } - if (!bottom_need_backward_[layer_id][bottom_id]) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - blobs_skip_backp.insert(blob_name); - } - } - } - // Handle force_backward if needed. - if (param.force_backward()) { - for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { - layer_need_backward_[layer_id] = true; - for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { - bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - layers_[layer_id]->set_param_propagate_down(param_id, true); - } - } - } - // In the end, all remaining blobs are considered output blobs. 
- for (set::iterator it = available_blobs.begin(); - it != available_blobs.end(); ++it) { - LOG(INFO) << "This network produces output " << *it; - net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); - net_output_blob_indices_.push_back(blob_name_to_idx[*it]); - } - for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { - blob_names_index_[blob_names_[blob_id]] = blob_id; - } - for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { - layer_names_index_[layer_names_[layer_id]] = layer_id; - } - GetLearningRateAndWeightDecay(); - debug_info_ = param.debug_info(); - LOG(INFO) << "Network initialization done."; - LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); -} - -template + // Set phase from the state. + phase_ = in_param.state().phase(); + // Filter layers based on their include/exclude rules and + // the current NetState. + NetParameter filtered_param; + FilterNet(in_param, &filtered_param); + LOG(INFO) << "Initializing net from parameters: " << std::endl + << filtered_param.DebugString(); + // Create a copy of filtered_param with splits added where necessary. + NetParameter param; + InsertSplits(filtered_param, ¶m); + // Basically, build all the layers and set up their connections. + name_ = param.name(); + map blob_name_to_idx; + set < string > available_blobs; + CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) + << "Must specify either input_shape OR deprecated input_dim, not both."; + if (param.input_dim_size() > 0) { + // Deprecated 4D dimensions. 
+ CHECK_EQ(param.input_size() * 4, param.input_dim_size()) + << "Incorrect input blob dimension specifications."; + } else { + CHECK_EQ(param.input_size(), param.input_shape_size()) + << "Exactly one input_shape must be specified per input."; + } + memory_used_ = 0; + // set the input blobs + for (int input_id = 0; input_id < param.input_size(); ++input_id) { + const int layer_id = -1; // inputs have fake layer ID -1 + AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + // For each layer, set up its input and output + bottom_vecs_.resize(param.layer_size()); + top_vecs_.resize(param.layer_size()); + bottom_id_vecs_.resize(param.layer_size()); + param_id_vecs_.resize(param.layer_size()); + top_id_vecs_.resize(param.layer_size()); + bottom_need_backward_.resize(param.layer_size()); + for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { + // Inherit phase from net if unset. + if (!param.layer(layer_id).has_phase()) { + param.mutable_layer(layer_id)->set_phase(phase_); + } + // Setup layer. + const LayerParameter& layer_param = param.layer(layer_id); + if (layer_param.propagate_down_size() > 0) { + CHECK_EQ(layer_param.propagate_down_size(), + layer_param.bottom_size()) + << "propagate_down param must be specified " + << "either 0 or bottom_size times "; + } + layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); + layer_names_.push_back(layer_param.name()); + LOG(INFO) << "Creating Layer " << layer_param.name(); + bool need_backward = false; + + // Figure out this layer's input and output + for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); + ++bottom_id) { + const int blob_id = AppendBottom(param, layer_id, bottom_id, + &available_blobs, &blob_name_to_idx); + // If a blob needs backward, this layer should provide it. 
+ need_backward |= blob_need_backward_[blob_id]; + } + int num_top = layer_param.top_size(); + for (int top_id = 0; top_id < num_top; ++top_id) { + AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); + } + // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter + // specified fewer than the required number (as specified by + // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. + Layer < Dtype > *layer = layers_[layer_id].get(); + if (layer->AutoTopBlobs()) { + const int needed_num_top = + std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); + for (; num_top < needed_num_top; ++num_top) { + // Add "anonymous" top blobs -- do not modify available_blobs or + // blob_name_to_idx as we don't want these blobs to be usable as input + // to other layers. + AppendTop(param, layer_id, num_top, NULL, NULL); + } + } + // After this layer is connected, set it up. + LOG(INFO) << "Setting up " << layer_names_[layer_id]; + layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { + blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); + } + blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); + LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); + if (layer->loss(top_id)) { + LOG(INFO) << " with loss weight " << layer->loss(top_id); + } + memory_used_ += top_vecs_[layer_id][top_id]->count(); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + const int param_size = layer_param.param_size(); + const int num_param_blobs = layers_[layer_id]->blobs().size(); + CHECK_LE(param_size, num_param_blobs) + << "Too many params specified for layer " << layer_param.name(); + ParamSpec default_param_spec; + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + const ParamSpec* 
param_spec = + (param_id < param_size) ? + &layer_param.param(param_id) : + &default_param_spec; + const bool param_need_backward = param_spec->lr_mult() > 0; + need_backward |= param_need_backward; + layers_[layer_id]->set_param_propagate_down(param_id, + param_need_backward); + } + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + AppendParam(param, layer_id, param_id); + } + // Finally, set the backward flag + layer_need_backward_.push_back(need_backward); + if (need_backward) { + for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { + blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; + } + } + } + // Go through the net backwards to determine which blobs contribute to the + // loss. We can skip backward computation for blobs that don't contribute + // to the loss. + // Also checks if all bottom blobs don't need backward computation (possible + // because the skip_propagate_down param) and so we can skip bacward + // computation for the entire layer + set < string > blobs_under_loss; + set < string > blobs_skip_backp; + for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { + bool layer_contributes_loss = false; + bool layer_skip_propagate_down = true; + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + if (layers_[layer_id]->loss(top_id) || + (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + layer_contributes_loss = true; + } + if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { + layer_skip_propagate_down = false; + } + if (layer_contributes_loss && !layer_skip_propagate_down) + break; + } + // If this layer can skip backward computation, also all his bottom blobs + // don't need backpropagation + if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { + layer_need_backward_[layer_id] = false; + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + 
++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = false; + } + } + if (!layer_contributes_loss) { + layer_need_backward_[layer_id] = false; + } + if (layer_need_backward_[layer_id]) { + LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; + } else { + LOG(INFO) << layer_names_[layer_id] + << " does not need backward computation."; + } + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + ++bottom_id) { + if (layer_contributes_loss) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blobs_under_loss.insert(blob_name); + } else { + bottom_need_backward_[layer_id][bottom_id] = false; + } + if (!bottom_need_backward_[layer_id][bottom_id]) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blobs_skip_backp.insert(blob_name); + } + } + } + // Handle force_backward if needed. + if (param.force_backward()) { + for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { + layer_need_backward_[layer_id] = true; + for (int bottom_id = 0; + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = + bottom_need_backward_[layer_id][bottom_id] || + layers_[layer_id]->AllowForceBackward(bottom_id); + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || + bottom_need_backward_[layer_id][bottom_id]; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + layers_[layer_id]->set_param_propagate_down(param_id, true); + } + } + } + // In the end, all remaining blobs are considered output blobs. 
+ for (set::iterator it = available_blobs.begin(); + it != available_blobs.end(); ++it) { + LOG(INFO) << "This network produces output " << *it; + net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); + net_output_blob_indices_.push_back(blob_name_to_idx[*it]); + } + for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { + blob_names_index_[blob_names_[blob_id]] = blob_id; + } + for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { + layer_names_index_[layer_names_[layer_id]] = layer_id; + } + GetLearningRateAndWeightDecay(); + debug_info_ = param.debug_info(); + LOG(INFO) << "Network initialization done."; + LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); +} + +template void Net::FilterNet(const NetParameter& param, - NetParameter* param_filtered) { - NetState net_state(param.state()); - param_filtered->CopyFrom(param); - param_filtered->clear_layer(); - for (int i = 0; i < param.layer_size(); ++i) { - const LayerParameter& layer_param = param.layer(i); - const string& layer_name = layer_param.name(); - CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) - << "Specify either include rules or exclude rules; not both."; - // If no include rules are specified, the layer is included by default and - // only excluded if it meets one of the exclude rules. 
- bool layer_included = (layer_param.include_size() == 0); - for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { - layer_included = false; - } - } - for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { - layer_included = true; - } - } - if (layer_included) { - param_filtered->add_layer()->CopyFrom(layer_param); - } - } -} - -template + NetParameter* param_filtered) { + NetState net_state(param.state()); + param_filtered->CopyFrom(param); + param_filtered->clear_layer(); + for (int i = 0; i < param.layer_size(); ++i) { + const LayerParameter& layer_param = param.layer(i); + const string& layer_name = layer_param.name(); + CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) + << "Specify either include rules or exclude rules; not both."; + // If no include rules are specified, the layer is included by default and + // only excluded if it meets one of the exclude rules. + bool layer_included = (layer_param.include_size() == 0); + for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { + layer_included = false; + } + } + for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { + layer_included = true; + } + } + if (layer_included) { + param_filtered->add_layer()->CopyFrom(layer_param); + } + } +} + +template bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { - // Check whether the rule is broken due to phase. 
- if (rule.has_phase()) { - if (rule.phase() != state.phase()) { - LOG(INFO) << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to min level. - if (rule.has_min_level()) { - if (state.level() < rule.min_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to max level. - if (rule.has_max_level()) { - if (state.level() > rule.max_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to stage. The NetState must - // contain ALL of the rule's stages to meet it. - for (int i = 0; i < rule.stage_size(); ++i) { - // Check that the NetState contains the rule's ith stage. - bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.stage(i) == state.stage(j)) { has_stage = true; } - } - if (!has_stage) { - LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to not_stage. The NetState must - // contain NONE of the rule's not_stages to meet it. - for (int i = 0; i < rule.not_stage_size(); ++i) { - // Check that the NetState contains the rule's ith not_stage. 
- bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } - } - if (has_stage) { - LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - return true; + const NetStateRule& rule, const string& layer_name) { + // Check whether the rule is broken due to phase. + if (rule.has_phase()) { + if (rule.phase() != state.phase()) { + LOG(INFO) << "The NetState phase (" << state.phase() + << ") differed from the phase (" << rule.phase() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to min level. + if (rule.has_min_level()) { + if (state.level() < rule.min_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the min_level (" << rule.min_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to max level. + if (rule.has_max_level()) { + if (state.level() > rule.max_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the max_level (" << rule.max_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to stage. The NetState must + // contain ALL of the rule's stages to meet it. + for (int i = 0; i < rule.stage_size(); ++i) { + // Check that the NetState contains the rule's ith stage. + bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.stage(i) == state.stage(j)) { + has_stage = true; + } + } + if (!has_stage) { + LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to not_stage. 
The NetState must + // contain NONE of the rule's not_stages to meet it. + for (int i = 0; i < rule.not_stage_size(); ++i) { + // Check that the NetState contains the rule's ith not_stage. + bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.not_stage(i) == state.stage(j)) { + has_stage = true; + } + } + if (has_stage) { + LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + return true; } // Helper for Net::Init: add a new input or top blob to the net. (Inputs have // layer_id == -1, tops have layer_id >= 0.) -template +template void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { - shared_ptr layer_param((layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : NULL); - const string& blob_name = layer_param ? - (layer_param->top_size() > top_id ? - layer_param->top(top_id) : "(automatic)") : param.input(top_id); - // Check if we are doing in-place computation - if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { - // In-place computation - LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; - top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); - top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); - } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { - // If we are not doing in-place computation but have duplicated blobs, - // raise an error. - LOG(FATAL) << "Duplicate blobs produced by multiple sources."; - } else { - // Normal output. 
- if (layer_param) { - LOG(INFO) << layer_param->name() << " -> " << blob_name; - } else { - LOG(INFO) << "Input " << top_id << " -> " << blob_name; - } - shared_ptr > blob_pointer(new Blob()); - const int blob_id = blobs_.size(); - blobs_.push_back(blob_pointer); - blob_names_.push_back(blob_name); - blob_need_backward_.push_back(false); - if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } - if (layer_id == -1) { - // Set the (explicitly specified) dimensions of the input blob. - if (param.input_dim_size() > 0) { - blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); - } else { - blob_pointer->Reshape(param.input_shape(top_id)); - } - net_input_blob_indices_.push_back(blob_id); - net_input_blobs_.push_back(blob_pointer.get()); - } else { - top_id_vecs_[layer_id].push_back(blob_id); - top_vecs_[layer_id].push_back(blob_pointer.get()); - } - } - if (available_blobs) { available_blobs->insert(blob_name); } + const int top_id, set* available_blobs, + map* blob_name_to_idx) { + shared_ptr < LayerParameter + > layer_param( + (layer_id >= 0) ? + (new LayerParameter(param.layer(layer_id))) : + NULL); + const string& blob_name = + layer_param ? + (layer_param->top_size() > top_id ? 
+ layer_param->top(top_id) : + "(automatic)") : + param.input(top_id); + // Check if we are doing in-place computation + if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && + blob_name == layer_param->bottom(top_id)) { + // In-place computation + LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; + top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); + top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); + } else if (blob_name_to_idx && + blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + // If we are not doing in-place computation but have duplicated blobs, + // raise an error. + LOG(FATAL) << "Duplicate blobs produced by multiple sources."; + } else { + // Normal output. + if (layer_param) { + LOG(INFO) << layer_param->name() << " -> " << blob_name; + } else { + LOG(INFO) << "Input " << top_id << " -> " << blob_name; + } + shared_ptr < Blob > blob_pointer(new Blob()); + const int blob_id = blobs_.size(); + blobs_.push_back(blob_pointer); + blob_names_.push_back(blob_name); + blob_need_backward_.push_back(false); + if (blob_name_to_idx) { + (*blob_name_to_idx)[blob_name] = blob_id; + } + if (layer_id == -1) { + // Set the (explicitly specified) dimensions of the input blob. + if (param.input_dim_size() > 0) { + blob_pointer->Reshape(param.input_dim(top_id * 4), + param.input_dim(top_id * 4 + 1), + param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); + } else { + blob_pointer->Reshape(param.input_shape(top_id)); + } + net_input_blob_indices_.push_back(blob_id); + net_input_blobs_.push_back(blob_pointer.get()); + } else { + top_id_vecs_[layer_id].push_back(blob_id); + top_vecs_[layer_id].push_back(blob_pointer.get()); + } + } + if (available_blobs) { + available_blobs->insert(blob_name); + } } // Helper for Net::Init: add a new bottom blob to the net. 
-template +template int Net::AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx) { - const LayerParameter& layer_param = param.layer(layer_id); - const string& blob_name = layer_param.bottom(bottom_id); - if (available_blobs->find(blob_name) == available_blobs->end()) { - LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; - } - const int blob_id = (*blob_name_to_idx)[blob_name]; - LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; - bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); - bottom_id_vecs_[layer_id].push_back(blob_id); - available_blobs->erase(blob_name); - bool propagate_down = true; - // Check if the backpropagation on bottom_id should be skipped - if (layer_param.propagate_down_size() > 0) - propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; - bottom_need_backward_[layer_id].push_back(need_backward); - return blob_id; -} - -template + const int bottom_id, set* available_blobs, + map* blob_name_to_idx) { + const LayerParameter& layer_param = param.layer(layer_id); + const string& blob_name = layer_param.bottom(bottom_id); + if (available_blobs->find(blob_name) == available_blobs->end()) { + LOG(FATAL) << "Unknown blob input " << blob_name + << " (at index " << bottom_id << ") to layer " << layer_id; + } + const int blob_id = (*blob_name_to_idx)[blob_name]; + LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; + bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); + bottom_id_vecs_[layer_id].push_back(blob_id); + available_blobs->erase(blob_name); + bool propagate_down = true; + // Check if the backpropagation on bottom_id should be skipped + if (layer_param.propagate_down_size() > 0) + propagate_down = layer_param.propagate_down(bottom_id); + const bool need_backward = blob_need_backward_[blob_id] && + 
propagate_down; + bottom_need_backward_[layer_id].push_back(need_backward); + return blob_id; +} + +template void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { - const LayerParameter& layer_param = layers_[layer_id]->layer_param(); - const int param_size = layer_param.param_size(); - string param_name = - (param_size > param_id) ? layer_param.param(param_id).name() : ""; - if (param_name.size()) { - param_display_names_.push_back(param_name); - } else { - ostringstream param_display_name; - param_display_name << param_id; - param_display_names_.push_back(param_display_name.str()); - } - const int net_param_id = params_.size(); - params_.push_back(layers_[layer_id]->blobs()[param_id]); - param_id_vecs_[layer_id].push_back(net_param_id); - param_layer_indices_.push_back(make_pair(layer_id, param_id)); - if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { - // This layer "owns" this parameter blob -- it is either anonymous - // (i.e., not given a param_name) or explicitly given a name that we - // haven't already seen. 
- param_owners_.push_back(-1); - if (param_name.size()) { - param_names_index_[param_name] = net_param_id; - } - } else { - // Named param blob with name we've seen before: share params - const int owner_net_param_id = param_names_index_[param_name]; - param_owners_.push_back(owner_net_param_id); - const pair& owner_index = - param_layer_indices_[owner_net_param_id]; - const int owner_layer_id = owner_index.first; - const int owner_param_id = owner_index.second; - LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); - Blob* owner_blob = - layers_[owner_layer_id]->blobs()[owner_param_id].get(); - const int param_size = layer_param.param_size(); - if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { - // Permissive dimension checking -- only check counts are the same. - CHECK_EQ(this_blob->count(), owner_blob->count()) - << "Shared parameter blobs must have the same count."; - } else { - // Strict dimension checking -- all dims must be the same. - CHECK(this_blob->shape() == owner_blob->shape()); - } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); - } -} - -template + const int param_id) { + const LayerParameter& layer_param = layers_[layer_id]->layer_param(); + const int param_size = layer_param.param_size(); + string param_name = + (param_size > param_id) ? 
layer_param.param(param_id).name() : ""; + if (param_name.size()) { + param_display_names_.push_back(param_name); + } else { + ostringstream param_display_name; + param_display_name << param_id; + param_display_names_.push_back(param_display_name.str()); + } + const int net_param_id = params_.size(); + params_.push_back(layers_[layer_id]->blobs()[param_id]); + param_id_vecs_[layer_id].push_back(net_param_id); + param_layer_indices_.push_back(make_pair(layer_id, param_id)); + if (!param_size || !param_name.size() || (param_name.size() && + param_names_index_.find(param_name) == param_names_index_.end())) { + // This layer "owns" this parameter blob -- it is either anonymous + // (i.e., not given a param_name) or explicitly given a name that we + // haven't already seen. + param_owners_.push_back(-1); + if (param_name.size()) { + param_names_index_[param_name] = net_param_id; + } + } else { + // Named param blob with name we've seen before: share params + const int owner_net_param_id = param_names_index_[param_name]; + param_owners_.push_back(owner_net_param_id); + const pair& owner_index = + param_layer_indices_[owner_net_param_id]; + const int owner_layer_id = owner_index.first; + const int owner_param_id = owner_index.second; + LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " + << "layer '" << layer_names_[owner_layer_id] << "', param " + << "index " << owner_param_id; + Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); + Blob < Dtype > *owner_blob = + layers_[owner_layer_id]->blobs()[owner_param_id].get(); + const int param_size = layer_param.param_size(); + if (param_size > param_id && (layer_param.param(param_id).share_mode() == + ParamSpec_DimCheckMode_PERMISSIVE)) { + // Permissive dimension checking -- only check counts are the same. + CHECK_EQ(this_blob->count(), owner_blob->count()) + << "Shared parameter blobs must have the same count."; + } else { + // Strict dimension checking -- all dims must be the same. 
+ CHECK(this_blob->shape() == owner_blob->shape()); + } + layers_[layer_id]->blobs()[param_id]->ShareData( + *layers_[owner_layer_id]->blobs()[owner_param_id]); + } +} + +template void Net::GetLearningRateAndWeightDecay() { - LOG(INFO) << "Collecting Learning Rate and Weight Decay."; - ParamSpec default_param_spec; - for (int i = 0; i < layers_.size(); ++i) { - vector > >& layer_blobs = layers_[i]->blobs(); - for (int j = 0; j < layer_blobs.size(); ++j) { - const ParamSpec* param_spec = - (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; - params_lr_.push_back(param_spec->lr_mult()); - params_weight_decay_.push_back(param_spec->decay_mult()); - } - } -} - -template + LOG(INFO) << "Collecting Learning Rate and Weight Decay."; + ParamSpec default_param_spec; + for (int i = 0; i < layers_.size(); ++i) { + vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); + for (int j = 0; j < layer_blobs.size(); ++j) { + const ParamSpec* param_spec = + (layers_[i]->layer_param().param_size() > j) ? 
+ &layers_[i]->layer_param().param(j) : &default_param_spec; + params_lr_.push_back(param_spec->lr_mult()); + params_weight_decay_.push_back(param_spec->decay_mult()); + } + } +} + +template Dtype Net::ForwardFromTo(int start, int end) { - CHECK_GE(start, 0); - CHECK_LT(end, layers_.size()); - Dtype loss = 0; - if (debug_info_) { - for (int i = 0; i < net_input_blobs_.size(); ++i) { - InputDebugInfo(i); - } - } - - CPUTimer forward_timer; - CPUTimer layer_timer; - forward_timer.Start(); - - for (int i = start; i <= end; ++i) { - layer_timer.Start(); - Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); - loss += layer_loss; - if (debug_info_) { ForwardDebugInfo(i); } - clFinish(amdDevice.CommandQueue); - layer_timer.Stop(); - printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); - } - - forward_timer.Stop(); - printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); - - return loss; -} - -template + CHECK_GE(start, 0); + CHECK_LT(end, layers_.size()); + Dtype loss = 0; + if (debug_info_) { + for (int i = 0; i < net_input_blobs_.size(); ++i) { + InputDebugInfo(i); + } + } + + CPUTimer forward_timer; + CPUTimer layer_timer; + forward_timer.Start(); + + for (int i = start; i <= end; ++i) { + layer_timer.Start(); + Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); + loss += layer_loss; + if (debug_info_) { + ForwardDebugInfo(i); + } + clFinish(amdDevice.CommandQueue); + layer_timer.Stop(); + printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); + } + + forward_timer.Stop(); + printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); + + return loss; +} + +template Dtype Net::ForwardFrom(int start) { - return ForwardFromTo(start, layers_.size() - 1); + return ForwardFromTo(start, layers_.size() - 1); } -template +template Dtype Net::ForwardTo(int end) { - return ForwardFromTo(0, end); + return ForwardFromTo(0, end); } -template 
+template const vector*>& Net::ForwardPrefilled(Dtype* loss) { - if (loss != NULL) { - *loss = ForwardFromTo(0, layers_.size() - 1); - } else { - ForwardFromTo(0, layers_.size() - 1); - } - return net_output_blobs_; + if (loss != NULL) { + *loss = ForwardFromTo(0, layers_.size() - 1); + } else { + ForwardFromTo(0, layers_.size() - 1); + } + return net_output_blobs_; } -template +template const vector*>& Net::Forward( - const vector*> & bottom, Dtype* loss) { - // Copy bottom to internal bottom - for (int i = 0; i < bottom.size(); ++i) { - net_input_blobs_[i]->CopyFrom(*bottom[i]); - } - return ForwardPrefilled(loss); + const vector*> & bottom, Dtype* loss) { + // Copy bottom to internal bottom + for (int i = 0; i < bottom.size(); ++i) { + net_input_blobs_[i]->CopyFrom(*bottom[i]); + } + return ForwardPrefilled(loss); } -template +template string Net::Forward(const string& input_blob_protos, Dtype* loss) { - BlobProtoVector blob_proto_vec; - if (net_input_blobs_.size()) { - blob_proto_vec.ParseFromString(input_blob_protos); - CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) - << "Incorrect input size."; - for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { - net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); - } - } - ForwardPrefilled(loss); - blob_proto_vec.Clear(); - for (int i = 0; i < net_output_blobs_.size(); ++i) { - net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); - } - string output; - blob_proto_vec.SerializeToString(&output); - return output; -} - -template + BlobProtoVector blob_proto_vec; + if (net_input_blobs_.size()) { + blob_proto_vec.ParseFromString(input_blob_protos); + CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) + << "Incorrect input size."; + for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { + net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); + } + } + ForwardPrefilled(loss); + blob_proto_vec.Clear(); + for (int i = 0; i < net_output_blobs_.size(); ++i) { + 
net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); + } + string output; + blob_proto_vec.SerializeToString(&output); + return output; +} + +template void Net::BackwardFromTo(int start, int end) { - CHECK_GE(end, 0); - CHECK_LT(start, layers_.size()); - - CPUTimer backward_timer; - CPUTimer layer_timer; - backward_timer.Start(); - - for (int i = start; i >= end; --i) { - layer_timer.Start(); - if (layer_need_backward_[i]) { - layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { BackwardDebugInfo(i); } - clFinish(amdDevice.CommandQueue); - layer_timer.Start(); - printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); - } - } - - backward_timer.Stop(); - printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); -} - -template + CHECK_GE(end, 0); + CHECK_LT(start, layers_.size()); + + CPUTimer backward_timer; + CPUTimer layer_timer; + backward_timer.Start(); + + for (int i = start; i >= end; --i) { + layer_timer.Start(); + if (layer_need_backward_[i]) { + layers_[i]->Backward( + top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); + if (debug_info_) { + BackwardDebugInfo(i); + } + clFinish(amdDevice.CommandQueue); + layer_timer.Start(); + printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); + } + } + + backward_timer.Stop(); + printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); +} + +template void Net::InputDebugInfo(const int input_id) { - const Blob& blob = *net_input_blobs_[input_id]; - const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; + const Blob& blob = *net_input_blobs_[input_id]; + const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / 
blob.count(); + LOG(INFO) << " [Forward] " + << "Input " << blob_name << " data: " << data_abs_val_mean; } -template +template void Net::ForwardDebugInfo(const int layer_id) { - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const Blob& blob = *top_vecs_[layer_id][top_id]; - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const int net_param_id = param_id_vecs_[layer_id][param_id]; - const string& blob_name = param_display_names_[net_param_id]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; - } -} - -template + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const Blob& blob = *top_vecs_[layer_id][top_id]; + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << data_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const int net_param_id = param_id_vecs_[layer_id][param_id]; + const string& blob_name = param_display_names_[net_param_id]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name + << " data: " << data_abs_val_mean; + } +} + +template void 
Net::BackwardDebugInfo(const int layer_id) { - const vector*>& bottom_vec = bottom_vecs_[layer_id]; - for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } - const Blob& blob = *bottom_vec[bottom_id]; - const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; - } -} - -template + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if (!bottom_need_backward_[layer_id][bottom_id]) { + continue; + } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << diff_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { + continue; + } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << diff_abs_val_mean; + } +} + +template void 
Net::UpdateDebugInfo(const int param_id) { - const Blob& blob = *params_[param_id]; - const int param_owner = param_owners_[param_id]; - const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; - const string& param_display_name = param_display_names_[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (param_owner < 0) { - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; - } else { - const string& owner_layer_name = - layer_names_[param_layer_indices_[param_owner].first]; - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " - << "param " << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; - } -} - -template + const Blob& blob = *params_[param_id]; + const int param_owner = param_owners_[param_id]; + const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; + const string& param_display_name = param_display_names_[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + if (param_owner < 0) { + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Update] Layer " << layer_name + << ", param " << param_display_name + << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; + } else { + const string& owner_layer_name = + layer_names_[param_layer_indices_[param_owner].first]; + LOG(INFO) << " [Update] Layer " << layer_name + << ", param blob " << param_display_name + << " (owned by layer " << owner_layer_name << ", " + << "param " << param_display_names_[param_owners_[param_id]] << ")" + << " diff: " << diff_abs_val_mean; + } +} + +template void Net::ShareTrainedLayersWith(const Net* other) { - int num_source_layers = 
other->layers().size(); - for (int i = 0; i < num_source_layers; ++i) { - Layer* source_layer = other->layers()[i].get(); - const string& source_layer_name = other->layer_names()[i]; - int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - Blob* source_blob = source_layer->blobs()[j].get(); - CHECK(target_blobs[j]->shape() == source_blob->shape()); - target_blobs[j]->ShareData(*source_blob); - } - } -} - -template + int num_source_layers = other->layers().size(); + for (int i = 0; i < num_source_layers; ++i) { + Layer < Dtype > *source_layer = other->layers()[i].get(); + const string& source_layer_name = other->layer_names()[i]; + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() && + layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector < shared_ptr > > &target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); + CHECK(target_blobs[j]->shape() == source_blob->shape()); + target_blobs[j]->ShareData(*source_blob); + } + } +} + +template void 
Net::BackwardFrom(int start) { - BackwardFromTo(start, 0); + BackwardFromTo(start, 0); } -template +template void Net::BackwardTo(int end) { - BackwardFromTo(layers_.size() - 1, end); + BackwardFromTo(layers_.size() - 1, end); } -template +template void Net::Backward() { - BackwardFromTo(layers_.size() - 1, 0); - if (debug_info_) { - Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; - for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - asum_data += params_[i]->asum_data(); - asum_diff += params_[i]->asum_diff(); - sumsq_data += params_[i]->sumsq_data(); - sumsq_diff += params_[i]->sumsq_diff(); - } - const Dtype l2norm_data = std::sqrt(sumsq_data); - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - LOG(ERROR) << " [Backward] All net params (data, diff): " - << "L1 norm = (" << asum_data << ", " << asum_diff << "); " - << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; - } -} - -template + BackwardFromTo(layers_.size() - 1, 0); + if (debug_info_) { + Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { + continue; + } + asum_data += params_[i]->asum_data(); + asum_diff += params_[i]->asum_diff(); + sumsq_data += params_[i]->sumsq_data(); + sumsq_diff += params_[i]->sumsq_diff(); + } + const Dtype l2norm_data = std::sqrt(sumsq_data); + const Dtype l2norm_diff = std::sqrt(sumsq_diff); + LOG(ERROR) << " [Backward] All net params (data, diff): " + << "L1 norm = (" << asum_data << ", " << asum_diff << "); " + << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; + } +} + +template void Net::Reshape() { - for (int i = 0; i < layers_.size(); ++i) { - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); - } + for (int i = 0; i < layers_.size(); ++i) { + layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); + } } -template +template void Net::CopyTrainedLayersFrom(const NetParameter& param) { - int 
num_source_layers = param.layer_size(); - for (int i = 0; i < num_source_layers; ++i) { - const LayerParameter& source_layer = param.layer(i); - const string& source_layer_name = source_layer.name(); - int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - const bool kReshape = false; - target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); - } - } -} - -template + int num_source_layers = param.layer_size(); + for (int i = 0; i < num_source_layers; ++i) { + const LayerParameter& source_layer = param.layer(i); + const string& source_layer_name = source_layer.name(); + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() && + layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector < shared_ptr > > &target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + const bool kReshape = false; + target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); + } + } +} + +template void Net::CopyTrainedLayersFrom(const string trained_filename) { - NetParameter param; - ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); - 
CopyTrainedLayersFrom(param); + NetParameter param; + ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); + CopyTrainedLayersFrom(param); } -template +template void Net::ToProto(NetParameter* param, bool write_diff) const { - param->Clear(); - param->set_name(name_); - // Add bottom and top - for (int i = 0; i < net_input_blob_indices_.size(); ++i) { - param->add_input(blob_names_[net_input_blob_indices_[i]]); - } - DLOG(INFO) << "Serializing " << layers_.size() << " layers"; - for (int i = 0; i < layers_.size(); ++i) { - LayerParameter* layer_param = param->add_layer(); - for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { - layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); - } - for (int j = 0; j < top_id_vecs_[i].size(); ++j) { - layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); - } - layers_[i]->ToProto(layer_param, write_diff); - } -} - -template + param->Clear(); + param->set_name(name_); + // Add bottom and top + for (int i = 0; i < net_input_blob_indices_.size(); ++i) { + param->add_input(blob_names_[net_input_blob_indices_[i]]); + } + DLOG(INFO) << "Serializing " << layers_.size() << " layers"; + for (int i = 0; i < layers_.size(); ++i) { + LayerParameter* layer_param = param->add_layer(); + for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { + layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); + } + for (int j = 0; j < top_id_vecs_[i].size(); ++j) { + layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); + } + layers_[i]->ToProto(layer_param, write_diff); + } +} + +template void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - caffe_gpu_axpy(count, 1.0, this_diff, owner_diff); + // First, accumulate the diffs of any shared parameters into their owner's + // diff. (Assumes that the learning rate, weight decay, etc. have already been + // accounted for in the current diff.) + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] < 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } + const int count = params_[i]->count(); + const Dtype* this_diff; + Dtype* owner_diff; + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + + switch (Caffe::mode()) { + case Caffe::CPU: + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + caffe_add(count, this_diff, owner_diff, owner_diff); + break; + case Caffe::GPU: + #ifndef CPU_ONLY + this_diff = params_[i]->gpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); + // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff); #else - NO_GPU; + NO_GPU; #endif - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } - } - // Now, update the owned parameters. 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - params_[i]->Update(); - } -} - -template - bool Net::has_blob(const string& blob_name) const { - return blob_names_index_.find(blob_name) != blob_names_index_.end(); -} - -template + break; + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } + } + // Now, update the owned parameters. + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } + params_[i]->Update(); + } +} + +template +bool Net::has_blob(const string& blob_name) const { + return blob_names_index_.find(blob_name) != blob_names_index_.end(); +} + +template const shared_ptr > Net::blob_by_name( - const string& blob_name) const { - shared_ptr > blob_ptr; - if (has_blob(blob_name)) { - blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; - } else { - blob_ptr.reset((Blob*)(NULL)); - LOG(WARNING) << "Unknown blob name " << blob_name; - } - return blob_ptr; -} - -template + const string& blob_name) const { + shared_ptr < Blob > blob_ptr; + if (has_blob(blob_name)) { + blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; + } else { + blob_ptr.reset((Blob*) (NULL)); + LOG(WARNING) << "Unknown blob name " << blob_name; + } + return blob_ptr; +} + +template bool Net::has_layer(const string& layer_name) const { - return layer_names_index_.find(layer_name) != layer_names_index_.end(); + return layer_names_index_.find(layer_name) != layer_names_index_.end(); } -template +template const shared_ptr > Net::layer_by_name( - const string& layer_name) const { - shared_ptr > layer_ptr; - if (has_layer(layer_name)) { - layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; - } else { - layer_ptr.reset((Layer*)(NULL)); - LOG(WARNING) << "Unknown layer name " << layer_name; - } - return layer_ptr; -} - -INSTANTIATE_CLASS(Net); + const string& layer_name) const { + 
shared_ptr < Layer > layer_ptr; + if (has_layer(layer_name)) { + layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; + } else { + layer_ptr.reset((Layer*) (NULL)); + LOG(WARNING) << "Unknown layer name " << layer_name; + } + return layer_ptr; +} + +INSTANTIATE_CLASS (Net); } // namespace caffe diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl index 03ddba8a..5da76b7e 100644 --- a/src/caffe/ocl/bnll_layer.cl +++ b/src/caffe/ocl/bnll_layer.cl @@ -28,25 +28,25 @@ template __kernel void BNLLForward(const int n, __global const T* in, __global T* out) { - int index = get_global_id(0); - if (index < n) { - out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index])); - } + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index])); + } } template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out); template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out); template __kernel void BNLLBackward(const int n, __global const T* in_diff, - __global const T* in_data, __global T* out_diff) { - int index = get_global_id(0); - if (index < n) { - T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } + __global const T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if (index < n) { + T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); + out_diff[index] = in_diff[index] * expval / (expval + 1.); + } } template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, - __global const float* in_data, __global float* out_diff); + __global const float* in_data, __global float* out_diff); template 
__attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff, - __global const double* in_data, __global double* out_diff); + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl index 71eb8c77..de504dec 100644 --- a/src/caffe/ocl/concat_layer.cl +++ b/src/caffe/ocl/concat_layer.cl @@ -26,29 +26,29 @@ template __kernel void Concat(const int nthreads, __global const T* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global T* out_data) { - int index = get_global_id(0); - if(index < nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } } -template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, - const bool forward, const int num_concats, const int concat_size, - const 
int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global float* out_data); -template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global double* out_data); +template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl index 8ed18ce4..0aeea80c 100644 --- a/src/caffe/ocl/contrastive_loss_layer.cl +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -26,39 +26,39 @@ template __kernel void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, - __global Dtype *bottom_diff) { - int i = get_global_id(0); - if(i < count) { - int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + 
Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { + int i = get_global_id(0); + if(i < count) { + int n = i / channels; // the num index, to access y and dist_sq + if (static_cast(y[n])) { // similar pairs + bottom_diff[i] = alpha * diff[i]; + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; + } else { + bottom_diff[i] = 0; + } + } + } } template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - __global const float* y, __global const float* diff, __global const float* dist_sq, - __global float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - __global const double* y, __global const double* diff, __global const double* dist_sq, - __global double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl index 4bfa39bc..bb2fc696 100644 --- 
a/src/caffe/ocl/dropout_layer.cl +++ b/src/caffe/ocl/dropout_layer.cl @@ -25,20 +25,19 @@ **************************************************************************************/ template -__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){ - int index = get_global_id(0); - if (index < n) - out[index] = in[index] * scale * mask[index]; +__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) { + int index = get_global_id(0); + if (index < n) + out[index] = in[index] * scale * mask[index]; } -template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); - template -__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){ - int index = get_global_id(0); - if (index < n) - out_diff[index] = in_diff[index] * scale * mask[index]; +__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff) { + int index = get_global_id(0); + if (index < n) + out_diff[index] = in_diff[index] * scale * mask[index]; } -template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template 
__attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl index d843884a..0e1812d8 100644 --- a/src/caffe/ocl/eltwise_layer.cl +++ b/src/caffe/ocl/eltwise_layer.cl @@ -26,48 +26,48 @@ template __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, - __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, - __global int* mask) { - int index = get_global_id(0); - if(index < nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype maxval = -FLT_MAX; + int maxidx = -1; + if (bottom_data_a[index] > bottom_data_b[index]) { + // only update for very first bottom_data blob (blob_idx == 0) + if (blob_idx == 0) { + maxval = bottom_data_a[index]; + top_data[index] = maxval; + maxidx = blob_idx; + mask[index] = maxidx; + } + } else { + maxval = bottom_data_b[index]; + top_data[index] = maxval; + maxidx = blob_idx + 1; + mask[index] = maxidx; + } + } } template __attribute__((mangled_name(MaxForward_float))) 
__kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, - __global const float* bottom_data_b, const int blob_idx, __global float* top_data, - __global int* mask); + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, - __global const double* bottom_data_b, const int blob_idx, __global double* top_data, - __global int* mask); + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); template __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, - const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { - int index = get_global_id(0); - if(index < nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype gradient = 0; + if (mask[index] == blob_idx) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, - const int blob_idx, __global const int* mask, __global float* bottom_diff); + const int blob_idx, __global const int* mask, __global float* bottom_diff); template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, - const int blob_idx, __global const int* mask, __global double* bottom_diff); + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 3e535d5f..c08d1310 100644 --- a/src/caffe/ocl/im2col.cl +++ 
b/src/caffe/ocl/im2col.cl @@ -25,267 +25,266 @@ **************************************************************************************/ template -__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){ - int index=get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; - if(index < n){ - int w_out=index %width_col; - index /= width_col; - int h_out=index%height_col; - int channel_in = index/height_col; - int channel_out=channel_in *ksize *ksize; - int h_in = h_out *stride-pad; - int w_in = w_out *stride-pad; - data_col +=(channel_out *height_col + h_out) *width_col + w_out; - data_im +=(channel_in * height + h_in) *width + w_in; - int i=0,j=0; - for(i=0;i= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col +=height_col *width_col; - } - } - } +__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) { + int index=get_global_id(0); + data_im = data_im + img_offset; + data_col = data_col + col_offset; + if(index < n) { + int w_out=index %width_col; + index /= width_col; + int h_out=index%height_col; + int channel_in = index/height_col; + int channel_out=channel_in *ksize *ksize; + int h_in = h_out *stride-pad; + int w_in = w_out *stride-pad; + data_col +=(channel_out *height_col + h_out) *width_col + w_out; + data_im +=(channel_in * height + h_in) *width + w_in; + int i=0,j=0; + for(i=0;i= 0 && w >= 0 && h < height && w < width) + *data_col=data_im[i * width + j]; + else *data_col=0; + data_col +=height_col *width_col; + } + } + } } -template __attribute__((mangled_name(im2colfloat))) __kernel void 
im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); +template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); template -__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){ +__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { - int index = get_global_id(0); + int index = get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; + data_im = data_im + img_offset; + data_col = data_col + col_offset; - int x_out = 
index % width_col; - int y_out = (index / width_col) % height_col; - int channel_in = (index / width_col / height_col) % channels; - int channel_out = channel_in * ksize * ksize; - int im_id = index / width_col / height_col / channels; + int x_out = index % width_col; + int y_out = (index / width_col) % height_col; + int channel_in = (index / width_col / height_col) % channels; + int channel_out = channel_in * ksize * ksize; + int im_id = index / width_col / height_col / channels; - int y_in = y_out * stride - pad; - int x_in = x_out * stride - pad; - int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; - int offset_im = im_id * channels * height * width + channel_in * height * width; + int y_in = y_out * stride - pad; + int x_in = x_out * stride - pad; + int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; + int offset_im = im_id * channels * height * width + channel_in * height * width; - for(int k_h = 0; k_h < ksize; k_h++){ - for(int k_w = 0; k_w < ksize; k_w++){ - int x_im = x_in + k_w; - int y_im = y_in + k_h; - int index_im = y_im * width + x_im; - int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; - if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) - data_col[offset_col + index_col] = data_im[offset_im + index_im]; - else - data_col[offset_col + index_col] = 0; - } - } + for(int k_h = 0; k_h < ksize; k_h++) { + for(int k_w = 0; k_w < ksize; k_w++) { + int x_im = x_in + k_w; + int y_im = y_in + k_h; + int index_im = y_im * width + x_im; + int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) + data_col[offset_col + index_col] = data_im[offset_im + index_im]; + else + data_col[offset_col + index_col] = 0; + } + } } -template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, 
__global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); -template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); - +template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); template __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_col, const int col_offset) { - data_im = data_im + img_offset; - data_col = data_col + col_offset; + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int 
col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; - int index = get_global_id(0); - if(index < n) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - __global T* data_col_ptr = data_col; - data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; - __global const T* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + i; - int w = w_in + j; - *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; - data_col_ptr += height_col * width_col; - } - } - } + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } } template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global double* data_col, const int col_offset); + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); template __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_im, const int img_offset) { - data_col = data_col + col_offset; - data_im = data_im + 
img_offset; - int index = get_global_id(0); - if(index < n) { - T val = 0; - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; - int w_col_end = min(w / stride_w + 1, width_col); - int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; - int h_col_end = min(h / stride_h + 1, height_col); - // equivalent implementation - int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; - int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; - int coeff_w_col = (1 - stride_w * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w,const int pad_h, const int pad_w, - const int stride_h, const int stride_w,const int height_col, const int width_col, - __global float* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, - const int col_offset, const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); template -__kernel void 
col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){ - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n){ - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } +__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) { + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } -template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); +template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); template -__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int 
width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){ - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n){ - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height) % channels; - int im = index / width / height / channels; - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col * optnum); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } +__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + // compute the start and end of the output + int w_col_start = (w < ksize) ? 
0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col * optnum); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } -template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); -template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, 
const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); template -__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){ +__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) { - int index = get_global_id(0); - data_opt = data_opt + opt_offset; - data_im = data_im + im_offset; - if(index < n){ - int w = index % width; - int h = (index / width) % height; - int c = index / (width * height) % channels; - int im = index / width / height / channels; + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + im_offset; + if(index < n) { + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; - int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; - data_opt[opt_index] = data_im[index]; - } + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } } -template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); -template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* 
data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); template -__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidyy = gidy; - int index = gidy / height; - int offset = index * width * height; - gidy = gidy % height; - if( gidx < width && gidyy < height * optnum ) - dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) { + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; } -template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); template -__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){ - int gidx = get_global_id(0); - int index; - index = (optnum==1) ? 
0: gidx % optnum; - dst = dst + top_offset; // now we point at (*top)[n] - int offset = gidx / optnum; - int i = 0; - for(i = 0 ; i < width; i++) - dst[(index * height + offset)* width + i] = src[gidx * width + i]; +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) { + int gidx = get_global_id(0); + int index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; } -template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); -template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl index ae1c9269..620bad72 100644 --- a/src/caffe/ocl/lrn_layer.cl +++ b/src/caffe/ocl/lrn_layer.cl @@ -26,113 +26,113 @@ template __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) - out[index] = in[index] * pow(scale[index], negative_beta); + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < 
nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); } template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); template -__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - in = in + offset; - scale = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in[head * step] * in[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in[head * step] * in[head * step]; - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } +__kernel void LRNFillScale(const int nthreads, 
__global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + in = in + offset; + scale = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + } } -template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); template 
__attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); template __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - bottom_data += offset; - top_data += offset; - scale += offset; - top_diff += offset; - bottom_diff += offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - 
size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } -} + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } } template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void 
LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 10d3b9f5..11352e16 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -25,220 +25,220 @@ **************************************************************************************/ template -__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp){ - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - T maxval = -FLT_MAX; - int maxidx = -1; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_data[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_data[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } +__kernel void MaxPoolForward(const int nthreads, 
__global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_data[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_data[maxidx]; + } + } + } + top_data[index] = maxval; + if (mask) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } } template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); -template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, 
const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); +template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); template -__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp){ - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - T aveval = 0; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_data[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } +__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int 
channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + T aveval = 0; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_data[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } } template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); template -__kernel void StoPoolForwardTrain(const 
int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp){ - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - T cumsum = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. 
- cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_data[h * width + w]; - return; - } - } - } - } +__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + T cumsum = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + } + } + const float thres = rand_idx[index] * cumsum; + // Second pass: get value, and set index. 
+ cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + if (cumsum >= thres) { + rand_idx[index] = ((n * channels + c) * height + h) * width + w; + top_data[index] = bottom_data[h * width + w]; + return; + } + } + } + } } template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); template -__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){ - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < count; index+=tmp){ - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = 
FLT_MIN; - T cumsum = FLT_MIN; - T cumvalues = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; } +__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < count; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = FLT_MIN; + T cumsum = FLT_MIN; + T cumvalues = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum;} } template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int 
stride_w,__global float* top_data); template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); template __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, - __global int* mask, __global T* top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, __global T* const bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - T gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff += offset; - if (mask) { - mask = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } else { - top_mask = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } + __global int* mask, __global T* top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global T* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + T gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + if (mask) { + mask = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } else { + top_mask = top_mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template -__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int 
pad_w, __global T* const bottom_diff){ - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - T gradient = 0; - top_diff += (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } +__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + T gradient = 0; + top_diff += (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff[ph * pooled_width + pw] / pool_size; + } + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); @@ -246,48 +246,48 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave template __kernel void StoPoolBackward(const int nthreads, - __global Dtype* rand_idx, __global Dtype* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global Dtype* bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total){ - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - rand_idx = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - top_diff = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff[ph * pooled_width + pw] * - (index == static_cast(rand_idx[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; + __global Dtype* rand_idx, __global Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global Dtype* bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + rand_idx = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + top_diff = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff[ph * pooled_width + pw] * + (index == static_cast(rand_idx[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; - } + } } -template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, - __global float* rand_idx, __global float* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, - __global double* rand_idx, __global double* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* bottom_diff); + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, 
__global double* bottom_diff); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index 6a45ea03..5fbea781 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -26,35 +26,35 @@ template __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { - int index = get_global_id(0); - if(index < count){ - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; - } + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; + } } template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor); template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor); template __kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) { - int index = get_global_id(0); - if(index < count){ - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + + (in_data[index] <= 0) * slope_data[c]); + } } template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* 
in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); template __kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) { - int index = get_global_id(0); - if(index < count){ - in_diff += offset_out; - out_diff += offset_in; - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } + int index = get_global_id(0); + if(index < count) { + in_diff += offset_out; + out_diff += offset_in; + out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); + } } -template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff); template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index f5a7a4db..94a41db4 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -30,23 +30,26 @@ //we use the open sourced threefry's GPU implementation typedef uint uint32_t; -struct r123array4x32 { uint32_t v[4]; }; +struct r123array4x32 { + uint32_t v[4]; +}; -enum 
r123_enum_threefry32x4 +enum r123_enum_threefry32x4 { R_32x4_0_0 = 10, R_32x4_0_1 = 26, R_32x4_1_0 = 11, R_32x4_1_1 = 21, R_32x4_2_0 = 13, R_32x4_2_1 = 27, - R_32x4_3_0 = 23, R_32x4_3_1 = 5, - R_32x4_4_0 = 6, R_32x4_4_1 = 20, + R_32x4_3_0 = 23, R_32x4_3_1 = 5, + R_32x4_4_0 = 6, R_32x4_4_1 = 20, R_32x4_5_0 = 17, R_32x4_5_1 = 11, R_32x4_6_0 = 25, R_32x4_6_1 = 10, R_32x4_7_0 = 18, R_32x4_7_1 = 20 }; -inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); inline uint32_t RotL_32(uint32_t x, unsigned int N) -{ + __attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) + { return (x << (N & 31)) | (x >> ((32 - N) & 31)); } @@ -54,20 +57,22 @@ typedef struct r123array4x32 threefry4x32_ctr_t; typedef struct r123array4x32 threefry4x32_key_t; typedef struct r123array4x32 threefry4x32_ukey_t; -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) -{ - threefry4x32_ctr_t X; - uint32_t ks[4 + 1]; - int i; +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) + { + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; ks[4] = 0x1BD11BDA; /* - for (i = 0; i < 4; i++) - { - ks[i] = k.v[i]; - X.v[i] = in.v[i]; - ks[4] ^= k.v[i]; - }*/ + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ { ks[0] = k.v[0]; X.v[0] = in.v[0]; @@ -89,660 +94,748 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_ X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; - if (Nrounds > 0) - { + if (Nrounds > 0) + { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += 
X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 1) { + } + if (Nrounds > 1) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 2) { + } + if (Nrounds > 2) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 3) { + } + if (Nrounds > 3) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 3) { + } + if (Nrounds > 3) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 1; - } if (Nrounds > 4) { + } + if (Nrounds > 4) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 5) { + } + if (Nrounds > 5) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 6) { + } + if (Nrounds > 6) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 7) { + } + if (Nrounds > 7) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 7) { + } + if (Nrounds > 7) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 2; - } if (Nrounds > 8) { + } + if (Nrounds > 8) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 9) { + } + if (Nrounds > 9) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], 
R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 10) { + } + if (Nrounds > 10) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 11) { + } + if (Nrounds > 11) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 11) { + } + if (Nrounds > 11) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 3; - } if (Nrounds > 12) { + } + if (Nrounds > 12) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 13) { + } + if (Nrounds > 13) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 14) { + } + if (Nrounds > 14) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 15) { + } + if (Nrounds > 15) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 15) { + } + if (Nrounds > 15) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 4; - } if (Nrounds > 16) { + } + if (Nrounds > 16) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 17) { + } + if (Nrounds > 17) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 18) { + } + if 
(Nrounds > 18) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 19) { + } + if (Nrounds > 19) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 19) { + } + if (Nrounds > 19) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 5; - } if (Nrounds > 20) { + } + if (Nrounds > 20) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 21) { + } + if (Nrounds > 21) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 22) { + } + if (Nrounds > 22) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 23) { + } + if (Nrounds > 23) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 23) { + } + if (Nrounds > 23) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 6; - } if (Nrounds > 24) { + } + if (Nrounds > 24) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 25) { + } + if (Nrounds > 25) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 26) { + } + if (Nrounds > 26) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], 
R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 27) { + } + if (Nrounds > 27) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 27) { + } + if (Nrounds > 27) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 7; - } if (Nrounds > 28) { + } + if (Nrounds > 28) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 29) { + } + if (Nrounds > 29) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 30) { + } + if (Nrounds > 30) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 31) { + } + if (Nrounds > 31) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 31) { + } + if (Nrounds > 31) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 8; - } if (Nrounds > 32) { + } + if (Nrounds > 32) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 33) { + } + if (Nrounds > 33) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 34) { + } + if (Nrounds > 34) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 35) { + } + if (Nrounds > 35) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] 
^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 35) { + } + if (Nrounds > 35) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 9; - } if (Nrounds > 36) { + } + if (Nrounds > 36) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 37) { + } + if (Nrounds > 37) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 38) { + } + if (Nrounds > 38) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 39) { + } + if (Nrounds > 39) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 39) { + } + if (Nrounds > 39) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 10; - } if (Nrounds > 40) { + } + if (Nrounds > 40) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 41) { + } + if (Nrounds > 41) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 42) { + } + if (Nrounds > 42) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 43) { + } + if (Nrounds > 43) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 43) { + } + if (Nrounds > 43) { X.v[0] += 
ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 11; - } if (Nrounds > 44) { + } + if (Nrounds > 44) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 45) { + } + if (Nrounds > 45) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 46) { + } + if (Nrounds > 46) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 47) { + } + if (Nrounds > 47) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 47) { + } + if (Nrounds > 47) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 12; - } if (Nrounds > 48) { + } + if (Nrounds > 48) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 49) { + } + if (Nrounds > 49) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 50) { + } + if (Nrounds > 50) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 51) { + } + if (Nrounds > 51) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 51) { + } + if (Nrounds > 51) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 13; - } if (Nrounds > 52) { + } + if (Nrounds > 52) { X.v[0] += X.v[1]; 
X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 53) { + } + if (Nrounds > 53) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 54) { + } + if (Nrounds > 54) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 55) { + } + if (Nrounds > 55) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 55) { + } + if (Nrounds > 55) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 14; - } if (Nrounds > 56) { + } + if (Nrounds > 56) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 57) { + } + if (Nrounds > 57) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 58) { + } + if (Nrounds > 58) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 59) { + } + if (Nrounds > 59) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 59) { + } + if (Nrounds > 59) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 15; - } if (Nrounds > 60) { + } + if (Nrounds > 60) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if 
(Nrounds > 61) { + } + if (Nrounds > 61) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 62) { + } + if (Nrounds > 62) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 63) { + } + if (Nrounds > 63) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 63) { + } + if (Nrounds > 63) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 16; - } if (Nrounds > 64) { + } + if (Nrounds > 64) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 65) { + } + if (Nrounds > 65) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 66) { + } + if (Nrounds > 66) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 67) { + } + if (Nrounds > 67) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 67) { + } + if (Nrounds > 67) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 17; - } if (Nrounds > 68) { + } + if (Nrounds > 68) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 69) { + } + if (Nrounds > 69) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = 
RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 70) { + } + if (Nrounds > 70) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } if (Nrounds > 71) { + } + if (Nrounds > 71) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } if (Nrounds > 71) { + } + if (Nrounds > 71) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 18; - } + } return X; -} +} template __kernel void PRNG_threefry4x32( - __global uint4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T inf, - T sup, - T threshold, - uint nrounds, - uint numrandom -){ - size_t gdx = get_global_id(0); + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - threefry4x32_ctr_t random4; + threefry4x32_ctr_t random4; - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - uint4 frnd; - - frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 
1 : 0; - - randomnumber[gdx] = frnd; - } -} + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + + randomnumber[gdx] = frnd; + } +} template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index b7865838..cf9302d5 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -25,21 +25,21 @@ **************************************************************************************/ template -__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){ +__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) { int index = get_global_id(0); if(index < count) - out[index] = in[index] > 0? in[index]:in[index]*negative_slope; + out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; } template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); template -__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){ +__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) { int index = get_global_id(0); - if(index < count) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); - } + if(index < count) { + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); + } } template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl index eb952e6f..a3a9345f 100644 --- a/src/caffe/ocl/sigmoid_layer.cl +++ b/src/caffe/ocl/sigmoid_layer.cl @@ -25,21 +25,21 @@ **************************************************************************************/ template -__kernel void SigmoidForward(const int count, __global T* in, __global T* out){ +__kernel void SigmoidForward(const int count, __global T* in, __global T* out) { int index = get_global_id(0); if(index < count) - out[index] = 1. / (1. + exp(-in[index])); + out[index] = 1. / (1. 
+ exp(-in[index])); } template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out); template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out); template -__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){ +__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { int index = get_global_id(0); - const T sigmoid_x = out_data[index]; - if(index < count) - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); + const T sigmoid_x = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); } template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl index 6b225283..4069ce16 100644 --- a/src/caffe/ocl/softmax_layer.cl +++ b/src/caffe/ocl/softmax_layer.cl @@ -25,49 +25,49 @@ **************************************************************************************/ template -__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){ - - int gid = get_global_id(0); - int size = get_global_size(0); - - resultScratch[gid] = 0.0; - for(int i = gid; i < num; i += size){ - resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(gid < 128) - resultScratch[gid] += resultScratch[gid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - if(gid < 64) - resultScratch[gid] += resultScratch[gid + 64]; - if(gid < 32) - resultScratch[gid] += resultScratch[gid + 32]; - if(gid < 16) - resultScratch[gid] += resultScratch[gid + 16]; - 
if(gid < 8) - resultScratch[gid] += resultScratch[gid + 8]; - if(gid < 4) - resultScratch[gid] += resultScratch[gid + 4]; - if(gid < 2) - resultScratch[gid] += resultScratch[gid + 2]; - if(gid < 1){ - resultScratch[gid] += resultScratch[gid + 1]; - loss[0] = resultScratch[gid]; - } +__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) { + + int gid = get_global_id(0); + int size = get_global_size(0); + + resultScratch[gid] = 0.0; + for(int i = gid; i < num; i += size) { + resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(gid < 128) + resultScratch[gid] += resultScratch[gid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + if(gid < 64) + resultScratch[gid] += resultScratch[gid + 64]; + if(gid < 32) + resultScratch[gid] += resultScratch[gid + 32]; + if(gid < 16) + resultScratch[gid] += resultScratch[gid + 16]; + if(gid < 8) + resultScratch[gid] += resultScratch[gid + 8]; + if(gid < 4) + resultScratch[gid] += resultScratch[gid + 4]; + if(gid < 2) + resultScratch[gid] += resultScratch[gid + 2]; + if(gid < 1) { + resultScratch[gid] += resultScratch[gid + 1]; + loss[0] = resultScratch[gid]; + } } template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); template -__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){ - //printf("softmax_div\n"); - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num*dim; index += total){ - int n = index / dim; - data[index] /= scale[n]; - } +__kernel void softmax_div (const int num, const int dim, 
__global T* scale, __global T* data) { + //printf("softmax_div\n"); + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num*dim; index += total) { + int n = index / dim; + data[index] /= scale[n]; + } } template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); @@ -75,97 +75,97 @@ template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softma template __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } } template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* out); + const int spatial_dim, __global const float* data, __global float* out); template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* out); + const int spatial_dim, __global const double* data, __global double* out); template __kernel void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, 
__global const T* channel_max, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } } template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); template __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* channel_sum) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } } template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* 
channel_sum); + const int spatial_dim, __global const float* data, __global float* channel_sum); template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* channel_sum); + const int spatial_dim, __global const double* data, __global double* channel_sum); template __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_sum, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } } template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const float* channel_sum, __global float* data); + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const double* channel_sum, __global double* data); - + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + template __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const T* data_1, __global const T* data_2, - __global T* channel_dot) { - int index = get_global_id(0); - if(index 
< num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } } template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const float* data_1, __global const float* data_2, - __global float* channel_dot); + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const double* data_1, __global const double* data_2, - __global double* channel_dot); + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index 9dbe284f..025f59ac 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -26,78 +26,78 @@ template __kernel void SoftmaxLossForwardGPU(const int nthreads, - __global T* prob_data, __global T* label,__global T* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global T* counts) { - int index = get_global_id(0); - if(index < nthreads) { - const int n = index / 
spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - T(FLT_MIN))); - counts[index] = 1; - } - } + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global T* counts) { + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + T(FLT_MIN))); + counts[index] = 1; + } + } } template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global float* prob_data, __global float* label,__global float* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global float* counts); + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global double* prob_data, __global double* label,__global double* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global double* counts); + __global double* prob_data, __global double* label,__global double* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); template __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, 
- __global T* label,__global T* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, T* counts) { - const int channels = dim / spatial_dim; - int index = get_global_id(0); - if(index < nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); + __global T* label,__global T* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, T* counts) { + const int channels = dim / spatial_dim; + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } } template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, - __global float* label,__global float* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, float* counts); + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, float* counts); -template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, - __global double* label,__global double* bottom_diff, int num, int dim, - int spatial_dim, bool 
has_ignore_label_, - int ignore_label_, double* counts); +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, double* counts); template -__kernel void scal (const int num, const T alpha, __global T* data){ - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num; index += total){ - data[index] = data[index] * alpha; - } +__kernel void scal (const int num, const T alpha, __global T* data) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total) { + data[index] = data[index] * alpha; + } } -template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); -template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); +template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); +template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl index 2f0a08c6..a8bd05c9 100644 --- a/src/caffe/ocl/tanh_layer.cl +++ b/src/caffe/ocl/tanh_layer.cl @@ -25,21 +25,21 @@ **************************************************************************************/ template -__kernel void TanHForward(const int count, __global T* in, __global T* out){ +__kernel void TanHForward(const int count, __global T* in, __global T* out) { int index = get_global_id(0); if(index < count) - out[index] =tanh(in[index]); + out[index] =tanh(in[index]); } template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, 
__global float* in, __global float* out); template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out); template -__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){ +__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { int index = get_global_id(0); - const T tanhx = out_data[index]; - if(index < count) - out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx); + const T tanhx = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx); } template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl index 40d55f1c..19df83e2 100644 --- a/src/caffe/ocl/threshold_layer.cl +++ b/src/caffe/ocl/threshold_layer.cl @@ -25,10 +25,10 @@ **************************************************************************************/ template -__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out){ +__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) { int index = get_global_id(0); if(index < count) - out[index] =in[index] > threshold ? 1 : 0; + out[index] =in[index] > threshold ? 
1 : 0; } template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out); diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index d15f168c..07a16fbd 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -27,10 +27,10 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable template -__kernel void OCL_memset(__global T* buffer, const T value, const int size){ +__kernel void OCL_memset(__global T* buffer, const T value, const int size) { int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; + if(gdx < size) { + buffer[gdx] = value; } } @@ -38,19 +38,19 @@ template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__gl template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size); template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); -__kernel void OCL_memset2(__global int* buffer, const int value, const int size){ - int gdx = get_global_id(0); - if(gdx < size){ - buffer[gdx] = value; - } +__kernel void OCL_memset2(__global int* buffer, const int value, const int size) { + int gdx = get_global_id(0); + if(gdx < size) { + buffer[gdx] = value; + } } template -__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){ - int gdx = get_global_id(0); - if(gdx < N){ - Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); - } +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } } template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); @@ -58,33 +58,33 @@ template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caff template __kernel void 
caffe_gpu_abs(const int n, __global T* a, __global T* y) { - int index = get_global_id(0); - if(index < n) { - y[index] = fabs(a[index]); - } + int index = get_global_id(0); + if(index < n) { + y[index] = fabs(a[index]); + } } template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y); template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y); template -__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) { - T maxval = -FLT_MAX; - for (int i = 0; i < dim; i++) - maxval = max( data[index*dim + i], maxval ); - out[index] = maxval; - } +__kernel void get_max(const int num, const int dim, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } } template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out); template -__kernel void exp (const int num, __global T* data, __global T* out){ - int index = get_global_id(0); - if (index < num) - out[index] = exp(data[index]); +__kernel void exp (const int num, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); } template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); @@ -92,10 +92,10 @@ template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int template __kernel void kernel_sub(const int count, __global const T* a, 
__global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] - b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] - b[index]; + } } template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out); @@ -103,10 +103,10 @@ template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_ template __kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] + b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] + b[index]; + } } template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out); @@ -114,10 +114,10 @@ template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_ template __kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] / b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] / b[index]; + } } template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out); @@ -125,34 +125,32 @@ template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_ template __kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] * b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] * b[index]; + } } template __attribute__ 
((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out); template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, __global double* out); - template __kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = pow(data[index], alpha); - } + int index = get_global_id(0); + if(index < count) { + out[index] = pow(data[index], alpha); + } } template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out); template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out); - template __kernel void kernel_exp(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = exp(data[index]); - } + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } } template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); @@ -160,10 +158,10 @@ template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_ template __kernel void kernel_add_scalar(const int count, const T data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = out[index] + data; - } + int index = get_global_id(0); + if(index < count) { + out[index] = out[index] + data; + } } template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); @@ -171,79 +169,76 @@ template __attribute__ 
((mangled_name(kernel_add_scalar_double))) __kernel void template __kernel void kernel_log(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = log(data[index]); - } + int index = get_global_id(0); + if(index < count) { + out[index] = log(data[index]); + } } template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out); template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_log(const int count, __global const double* data, __global double* out); template -__kernel void diff (const int num, const int dim, __global T* data, __global T* label){ - int index = get_global_id(0); - int total = get_global_size(0); - int offset; - for(index; index < num; index += total){ - offset = (int) label[index]; - data[index * dim + offset] -= 1; - } +__kernel void diff (const int num, const int dim, __global T* data, __global T* label) { + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total) { + offset = (int) label[index]; + data[index * dim + offset] -= 1; + } } template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); - template -__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){ +__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) { int index = get_global_id(0); - if (index < n) - y[index] = a[index] / b[index]; + if (index < n) + y[index] = a[index] / b[index]; } template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); 
//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); template -__kernel void add_scalar (const int n, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] += alpha; +__kernel void add_scalar (const int n, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] += alpha; } template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); template -__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ - int index = get_global_id(0); - if (index < n) - y[index] = in1[index] + in2[index] ; +__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index]; } template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); template -__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){ - int index = get_global_id(0); - if (index < n) - y[index] = a[index] * b[index]; +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; } template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); template __attribute__ ((mangled_name(element_mul_double))) 
__kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); - template -__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){ - int index = get_global_id(0); - if (index < n) +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) // y[index] = a[index] + alpha; - y[index] = pow(a[index], alpha); + y[index] = pow(a[index], alpha); } -template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); -template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); - +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index cd9d2ef5..ae675500 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -13,398 +13,397 @@ #include "caffe/util/ocl_wrapper.hpp" namespace caffe { -template +template Solver::Solver(const SolverParameter& param) - : net_() { - Init(param); + : net_() { + Init(param); } -template -void Solver::ocl_setup(){ - scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); - add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); - div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); - powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); +template +void Solver::ocl_setup() { + scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); + add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); + div_kernel = 
clCreateKernel(amdDevice.Program, "div_float", NULL); + powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } -template +template Solver::Solver(const string& param_file) - : net_() { - SolverParameter param; - ReadProtoFromTextFileOrDie(param_file, ¶m); - Init(param); + : net_() { + SolverParameter param; + ReadProtoFromTextFileOrDie(param_file, ¶m); + Init(param); } -template +template void Solver::Init(const SolverParameter& param) { - LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); - param_ = param; - CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - - ocl_setup(); - - if (param_.random_seed() >= 0) { - Caffe::set_random_seed(param_.random_seed()); - } - // Scaffolding code - InitTrainNet(); - InitTestNets(); - LOG(INFO) << "Solver scaffolding done."; - iter_ = 0; - current_step_ = 0; + LOG(INFO) << "Initializing solver from parameters: " << std::endl + << param.DebugString(); + param_ = param; + CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; + + ocl_setup(); + + if (param_.random_seed() >= 0) { + Caffe::set_random_seed(param_.random_seed()); + } + // Scaffolding code + InitTrainNet(); + InitTestNets(); + LOG(INFO) << "Solver scaffolding done."; + iter_ = 0; + current_step_ = 0; } -template +template void Solver::InitTrainNet() { - const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); - const string& field_names = "net, net_param, train_net, train_net_param"; - CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " - << "using one of these fields: " << field_names; - CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " - << "one of these fields specifying a train_net: " << field_names; - NetParameter net_param; - if (param_.has_train_net_param()) { - LOG(INFO) << "Creating training net specified in train_net_param."; - 
net_param.CopyFrom(param_.train_net_param()); - } else if (param_.has_train_net()) { - LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); - ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); - } - if (param_.has_net_param()) { - LOG(INFO) << "Creating training net specified in net_param."; - net_param.CopyFrom(param_.net_param()); - } - if (param_.has_net()) { - LOG(INFO) << "Creating training net from net file: " << param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); - } - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param itself; - // finally, merge in any NetState specified by the train_state (highest - // precedence). - NetState net_state; - net_state.set_phase(TRAIN); - net_state.MergeFrom(net_param.state()); - net_state.MergeFrom(param_.train_state()); - net_param.mutable_state()->CopyFrom(net_state); - net_.reset(new Net(net_param)); + const int num_train_nets = param_.has_net() + param_.has_net_param() + + param_.has_train_net() + param_.has_train_net_param(); + const string& field_names = "net, net_param, train_net, train_net_param"; + CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " + << "using one of these fields: " << field_names; + CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " + << "one of these fields specifying a train_net: " << field_names; + NetParameter net_param; + if (param_.has_train_net_param()) { + LOG(INFO) << "Creating training net specified in train_net_param."; + net_param.CopyFrom(param_.train_net_param()); + } else if (param_.has_train_net()) { + LOG(INFO) << "Creating training net from train_net file: " + << param_.train_net(); + ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); + } + if (param_.has_net_param()) { + LOG(INFO) << "Creating training net specified in net_param."; + 
net_param.CopyFrom(param_.net_param()); + } + if (param_.has_net()) { + LOG(INFO) << "Creating training net from net file: " << param_.net(); + ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); + } + // Set the correct NetState. We start with the solver defaults (lowest + // precedence); then, merge in any NetState specified by the net_param itself; + // finally, merge in any NetState specified by the train_state (highest + // precedence). + NetState net_state; + net_state.set_phase(TRAIN); + net_state.MergeFrom(net_param.state()); + net_state.MergeFrom(param_.train_state()); + net_param.mutable_state()->CopyFrom(net_state); + net_.reset(new Net(net_param)); } -template +template void Solver::InitTestNets() { - const bool has_net_param = param_.has_net_param(); - const bool has_net_file = param_.has_net(); - const int num_generic_nets = has_net_param + has_net_file; - CHECK_LE(num_generic_nets, 1) - << "Both net_param and net_file may not be specified."; - const int num_test_net_params = param_.test_net_param_size(); - const int num_test_net_files = param_.test_net_size(); - const int num_test_nets = num_test_net_params + num_test_net_files; - if (num_generic_nets) { - CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } else { - CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } - // If we have a generic net (specified by net or net_param, rather than - // test_net or test_net_param), we may have an unlimited number of actual - // test networks -- the actual number is given by the number of remaining - // test_iters after any test nets specified by test_net_param and/or test_net - // are evaluated. 
- const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; - const int num_test_net_instances = num_test_nets + num_generic_net_instances; - if (param_.test_state_size()) { - CHECK_EQ(param_.test_state_size(), num_test_net_instances) - << "test_state must be unspecified or specified once per test net."; - } - if (num_test_net_instances) { - CHECK_GT(param_.test_interval(), 0); - } - int test_net_id = 0; - vector sources(num_test_net_instances); - vector net_params(num_test_net_instances); - for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { - sources[test_net_id] = "test_net_param"; - net_params[test_net_id].CopyFrom(param_.test_net_param(i)); - } - for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { - sources[test_net_id] = "test_net file: " + param_.test_net(i); - ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); - } - const int remaining_test_nets = param_.test_iter_size() - test_net_id; - if (has_net_param) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net_param"; - net_params[test_net_id].CopyFrom(param_.net_param()); - } - } - if (has_net_file) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net file: " + param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); - } - } - test_nets_.resize(num_test_net_instances); - for (int i = 0; i < num_test_net_instances; ++i) { - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param - // itself; finally, merge in any NetState specified by the test_state - // (highest precedence). 
- NetState net_state; - net_state.set_phase(TEST); - net_state.MergeFrom(net_params[i].state()); - if (param_.test_state_size()) { - net_state.MergeFrom(param_.test_state(i)); - } - net_params[i].mutable_state()->CopyFrom(net_state); - LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; - test_nets_[i].reset(new Net(net_params[i])); - test_nets_[i]->set_debug_info(param_.debug_info()); - } + const bool has_net_param = param_.has_net_param(); + const bool has_net_file = param_.has_net(); + const int num_generic_nets = has_net_param + has_net_file; + CHECK_LE(num_generic_nets, 1) + << "Both net_param and net_file may not be specified."; + const int num_test_net_params = param_.test_net_param_size(); + const int num_test_net_files = param_.test_net_size(); + const int num_test_nets = num_test_net_params + num_test_net_files; + if (num_generic_nets) { + CHECK_GE(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; + } else { + CHECK_EQ(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; + } + // If we have a generic net (specified by net or net_param, rather than + // test_net or test_net_param), we may have an unlimited number of actual + // test networks -- the actual number is given by the number of remaining + // test_iters after any test nets specified by test_net_param and/or test_net + // are evaluated. 
+ const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; + const int num_test_net_instances = num_test_nets + num_generic_net_instances; + if (param_.test_state_size()) { + CHECK_EQ(param_.test_state_size(), num_test_net_instances) + << "test_state must be unspecified or specified once per test net."; + } + if (num_test_net_instances) { + CHECK_GT(param_.test_interval(), 0); + } + int test_net_id = 0; + vector < string > sources(num_test_net_instances); + vector < NetParameter > net_params(num_test_net_instances); + for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { + sources[test_net_id] = "test_net_param"; + net_params[test_net_id].CopyFrom(param_.test_net_param(i)); + } + for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { + sources[test_net_id] = "test_net file: " + param_.test_net(i); + ReadNetParamsFromTextFileOrDie(param_.test_net(i), + &net_params[test_net_id]); + } + const int remaining_test_nets = param_.test_iter_size() - test_net_id; + if (has_net_param) { + for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { + sources[test_net_id] = "net_param"; + net_params[test_net_id].CopyFrom(param_.net_param()); + } + } + if (has_net_file) { + for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { + sources[test_net_id] = "net file: " + param_.net(); + ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); + } + } + test_nets_.resize(num_test_net_instances); + for (int i = 0; i < num_test_net_instances; ++i) { + // Set the correct NetState. We start with the solver defaults (lowest + // precedence); then, merge in any NetState specified by the net_param + // itself; finally, merge in any NetState specified by the test_state + // (highest precedence). 
+ NetState net_state; + net_state.set_phase(TEST); + net_state.MergeFrom(net_params[i].state()); + if (param_.test_state_size()) { + net_state.MergeFrom(param_.test_state(i)); + } + net_params[i].mutable_state()->CopyFrom(net_state); + LOG(INFO) + << "Creating test net (#" << i << ") specified by " << sources[i]; + test_nets_[i].reset(new Net(net_params[i])); + test_nets_[i]->set_debug_info(param_.debug_info()); + } } -template +template void Solver::Step(int iters) { - vector*> bottom_vec; - const int start_iter = iter_; - const int stop_iter = iter_ + iters; - int average_loss = this->param_.average_loss(); - vector losses; - Dtype smoothed_loss = 0; - - while (iter_ < stop_iter) { - // zero-init the params - for (int i = 0; i < net_->params().size(); ++i) { - shared_ptr > blob = net_->params()[i]; - switch (Caffe::mode()) { - case Caffe::CPU: - caffe_set(blob->count(), static_cast(0), - blob->mutable_cpu_diff()); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + vector*> bottom_vec; + const int start_iter = iter_; + const int stop_iter = iter_ + iters; + int average_loss = this->param_.average_loss(); + vector < Dtype > losses; + Dtype smoothed_loss = 0; + + while (iter_ < stop_iter) { + // zero-init the params + for (int i = 0; i < net_->params().size(); ++i) { + shared_ptr < Blob > blob = net_->params()[i]; + switch (Caffe::mode()) { + case Caffe::CPU: + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); + break; + case Caffe::GPU: + #ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - case Caffe::APU: -#ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + case Caffe::APU: + #ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - } - - if (param_.test_interval() && iter_ 
% param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization())) { - TestAll(); - } - - const bool display = param_.display() && iter_ % param_.display() == 0; - net_->set_debug_info(display && param_.debug_info()); - // accumulate the loss and gradient - Dtype loss = 0; - for (int i = 0; i < param_.iter_size(); ++i) { - loss += net_->ForwardBackward(bottom_vec); - } - loss /= param_.iter_size(); - // average the loss across iterations for smoothed reporting - if (losses.size() < average_loss) { - losses.push_back(loss); - int size = losses.size(); - smoothed_loss = (smoothed_loss * (size - 1) + loss) / size; - } else { - int idx = (iter_ - start_iter) % average_loss; - smoothed_loss += (loss - losses[idx]) / average_loss; - losses[idx] = loss; - printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx); - } - printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", smoothed_loss,average_loss, losses.size()); - if (display) { - LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; - const vector*>& result = net_->output_blobs(); - int score_index = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - const string& output_name = - net_->blob_names()[net_->output_blob_indices()[j]]; - const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; - for (int k = 0; k < result[j]->count(); ++k) { - ostringstream loss_msg_stream; - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; - } - LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); - } - } - } - ApplyUpdate(); - - // Increment the internal iter_ counter -- its value should always indicate - // the number of times the weights have been updated. - ++iter_; - - // Save a snapshot if needed. 
- if (param_.snapshot() && iter_ % param_.snapshot() == 0) { - Snapshot(); - } - } + break; + } + } + + if (param_.test_interval() && iter_ % param_.test_interval() == 0 + && (iter_ > 0 || param_.test_initialization())) { + TestAll(); + } + + const bool display = param_.display() && iter_ % param_.display() == 0; + net_->set_debug_info(display && param_.debug_info()); + // accumulate the loss and gradient + Dtype loss = 0; + for (int i = 0; i < param_.iter_size(); ++i) { + loss += net_->ForwardBackward(bottom_vec); + } + loss /= param_.iter_size(); + // average the loss across iterations for smoothed reporting + if (losses.size() < average_loss) { + losses.push_back(loss); + int size = losses.size(); + smoothed_loss = (smoothed_loss * (size - 1) + loss) / size; + } else { + int idx = (iter_ - start_iter) % average_loss; + smoothed_loss += (loss - losses[idx]) / average_loss; + losses[idx] = loss; + printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, + losses[idx], idx); + } + printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", + smoothed_loss, average_loss, losses.size()); + if (display) { + LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; + const vector*>& result = net_->output_blobs(); + int score_index = 0; + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); + const string& output_name = + net_->blob_names()[net_->output_blob_indices()[j]]; + const Dtype loss_weight = + net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + for (int k = 0; k < result[j]->count(); ++k) { + ostringstream loss_msg_stream; + if (loss_weight) { + loss_msg_stream << " (* " << loss_weight + << " = " << loss_weight * result_vec[k] << " loss)"; + } + LOG(INFO) << " Train net output #" + << score_index++ << ": " << output_name << " = " + << result_vec[k] << loss_msg_stream.str(); + } + } + } + ApplyUpdate(); + + // Increment the internal iter_ counter -- its value should always 
indicate + // the number of times the weights have been updated. + ++iter_; + + // Save a snapshot if needed. + if (param_.snapshot() && iter_ % param_.snapshot() == 0) { + Snapshot(); + } + } } -template +template void Solver::Solve(const char* resume_file) { - LOG(INFO) << "Solving " << net_->name(); - LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); - - if (resume_file) { - LOG(INFO) << "Restoring previous solver status from " << resume_file; - Restore(resume_file); - } - - // For a network that is trained by the solver, no bottom or top vecs - // should be given, and we will just provide dummy vecs. - Step(param_.max_iter() - iter_); - // If we haven't already, save a snapshot after optimization, unless - // overridden by setting snapshot_after_train := false - if (param_.snapshot_after_train() - && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { - Snapshot(); - } - // After the optimization is done, run an additional train and test pass to - // display the train and test loss/outputs if appropriate (based on the - // display and test_interval settings, respectively). Unlike in the rest of - // training, for the train net we only run a forward pass as we've already - // updated the parameters "max_iter" times -- this final pass is only done to - // display the loss, which is computed in the forward pass. 
- if (param_.display() && iter_ % param_.display() == 0) { - Dtype loss; - net_->ForwardPrefilled(&loss); - LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss; - } - if (param_.test_interval() && iter_ % param_.test_interval() == 0) { - TestAll(); - } - LOG(INFO) << "Optimization Done."; + LOG(INFO) << "Solving " << net_->name(); + LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); + + if (resume_file) { + LOG(INFO) << "Restoring previous solver status from " << resume_file; + Restore(resume_file); + } + + // For a network that is trained by the solver, no bottom or top vecs + // should be given, and we will just provide dummy vecs. + Step(param_.max_iter() - iter_); + // If we haven't already, save a snapshot after optimization, unless + // overridden by setting snapshot_after_train := false + if (param_.snapshot_after_train() + && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { + Snapshot(); + } + // After the optimization is done, run an additional train and test pass to + // display the train and test loss/outputs if appropriate (based on the + // display and test_interval settings, respectively). Unlike in the rest of + // training, for the train net we only run a forward pass as we've already + // updated the parameters "max_iter" times -- this final pass is only done to + // display the loss, which is computed in the forward pass. 
+ if (param_.display() && iter_ % param_.display() == 0) { + Dtype loss; + net_->ForwardPrefilled(&loss); + LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss; + } + if (param_.test_interval() && iter_ % param_.test_interval() == 0) { + TestAll(); + } + LOG(INFO) << "Optimization Done."; } - -template +template void Solver::TestAll() { - for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { - Test(test_net_id); - } + for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { + Test(test_net_id); + } } -template +template void Solver::Test(const int test_net_id) { - LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; - CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); - vector test_score; - vector test_score_output_id; - vector*> bottom_vec; - const shared_ptr >& test_net = test_nets_[test_net_id]; - Dtype loss = 0; - for (int i = 0; i < param_.test_iter(test_net_id); ++i) { - Dtype iter_loss; - const vector*>& result = - test_net->Forward(bottom_vec, &iter_loss); - if (param_.test_compute_loss()) { - loss += iter_loss; - } - if (i == 0) { - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < result[j]->count(); ++k) { - test_score.push_back(result_vec[k]); - test_score_output_id.push_back(j); - } - } - } else { - int idx = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < result[j]->count(); ++k) { - test_score[idx++] += result_vec[k]; - } - } - } - } - if (param_.test_compute_loss()) { - loss /= param_.test_iter(test_net_id); - LOG(INFO) << "Test loss: " << loss; - } - for (int i = 0; i < test_score.size(); ++i) { - const int output_blob_index = - test_net->output_blob_indices()[test_score_output_id[i]]; - const string& output_name = test_net->blob_names()[output_blob_index]; - const Dtype loss_weight = 
test_net->blob_loss_weights()[output_blob_index]; - ostringstream loss_msg_stream; - const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; - } - LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " - << mean_score << loss_msg_stream.str(); - } + LOG(INFO) << "Iteration " << iter_ + << ", Testing net (#" << test_net_id << ")"; + CHECK_NOTNULL(test_nets_[test_net_id].get())-> + ShareTrainedLayersWith(net_.get()); + vector < Dtype > test_score; + vector test_score_output_id; + vector*> bottom_vec; + const shared_ptr >& test_net = test_nets_[test_net_id]; + Dtype loss = 0; + for (int i = 0; i < param_.test_iter(test_net_id); ++i) { + Dtype iter_loss; + const vector*>& result = + test_net->Forward(bottom_vec, &iter_loss); + if (param_.test_compute_loss()) { + loss += iter_loss; + } + if (i == 0) { + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); + for (int k = 0; k < result[j]->count(); ++k) { + test_score.push_back(result_vec[k]); + test_score_output_id.push_back(j); + } + } + } else { + int idx = 0; + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); + for (int k = 0; k < result[j]->count(); ++k) { + test_score[idx++] += result_vec[k]; + } + } + } + } + if (param_.test_compute_loss()) { + loss /= param_.test_iter(test_net_id); + LOG(INFO) << "Test loss: " << loss; + } + for (int i = 0; i < test_score.size(); ++i) { + const int output_blob_index = + test_net->output_blob_indices()[test_score_output_id[i]]; + const string& output_name = test_net->blob_names()[output_blob_index]; + const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; + ostringstream loss_msg_stream; + const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); + if (loss_weight) { + loss_msg_stream << " (* " << loss_weight + << " = " 
<< loss_weight * mean_score << " loss)"; + } + LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " + << mean_score << loss_msg_stream.str(); + } } - -template +template void Solver::Snapshot() { - NetParameter net_param; - // For intermediate results, we will also dump the gradient values. - net_->ToProto(&net_param, param_.snapshot_diff()); - string filename(param_.snapshot_prefix()); - string model_filename, snapshot_filename; - const int kBufferSize = 20; - char iter_str_buffer[kBufferSize]; - snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); - filename += iter_str_buffer; - model_filename = filename + ".caffemodel"; - LOG(INFO) << "Snapshotting to " << model_filename; - WriteProtoToBinaryFile(net_param, model_filename.c_str()); - SolverState state; - SnapshotSolverState(&state); - state.set_iter(iter_); - state.set_learned_net(model_filename); - state.set_current_step(current_step_); - snapshot_filename = filename + ".solverstate"; - LOG(INFO) << "Snapshotting solver state to " << snapshot_filename; - WriteProtoToBinaryFile(state, snapshot_filename.c_str()); + NetParameter net_param; + // For intermediate results, we will also dump the gradient values. 
+ net_->ToProto(&net_param, param_.snapshot_diff()); + string filename(param_.snapshot_prefix()); + string model_filename, snapshot_filename; + const int kBufferSize = 20; + char iter_str_buffer[kBufferSize]; + snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); + filename += iter_str_buffer; + model_filename = filename + ".caffemodel"; + LOG(INFO) << "Snapshotting to " << model_filename; + WriteProtoToBinaryFile(net_param, model_filename.c_str()); + SolverState state; + SnapshotSolverState(&state); + state.set_iter(iter_); + state.set_learned_net(model_filename); + state.set_current_step(current_step_); + snapshot_filename = filename + ".solverstate"; + LOG(INFO) << "Snapshotting solver state to " << snapshot_filename; + WriteProtoToBinaryFile(state, snapshot_filename.c_str()); } -template +template void Solver::Restore(const char* state_file) { - SolverState state; - NetParameter net_param; - ReadProtoFromBinaryFile(state_file, &state); - if (state.has_learned_net()) { - ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); - net_->CopyTrainedLayersFrom(net_param); - } - iter_ = state.iter(); - current_step_ = state.current_step(); - RestoreSolverState(state); + SolverState state; + NetParameter net_param; + ReadProtoFromBinaryFile(state_file, &state); + if (state.has_learned_net()) { + ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); + net_->CopyTrainedLayersFrom(net_param); + } + iter_ = state.iter(); + current_step_ = state.current_step(); + RestoreSolverState(state); } - // Return the current learning rate. The currently implemented learning rate // policies are as follows: // - fixed: always return base_lr. @@ -420,385 +419,389 @@ void Solver::Restore(const char* state_file) { // // where base_lr, max_iter, gamma, step, stepvalue and power are defined // in the solver parameter protocol buffer, and iter is the current iteration. 
-template +template Dtype SGDSolver::GetLearningRate() { - Dtype rate; - const string& lr_policy = this->param_.lr_policy(); - if (lr_policy == "fixed") { - rate = this->param_.base_lr(); - } else if (lr_policy == "step") { - this->current_step_ = this->iter_ / this->param_.stepsize(); - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "exp") { - rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); - } else if (lr_policy == "inv") { - rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - - this->param_.power()); - } else if (lr_policy == "multistep") { - if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { - this->current_step_++; - LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; - } - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "poly") { - rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); - } else if (lr_policy == "sigmoid") { - rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) 
+ exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); - } else { - LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; - } - return rate; + Dtype rate; + const string& lr_policy = this->param_.lr_policy(); + if (lr_policy == "fixed") { + rate = this->param_.base_lr(); + } else if (lr_policy == "step") { + this->current_step_ = this->iter_ / this->param_.stepsize(); + rate = this->param_.base_lr() * + pow(this->param_.gamma(), this->current_step_); + } else if (lr_policy == "exp") { + rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); + } else if (lr_policy == "inv") { + rate = this->param_.base_lr() * + pow(Dtype(1) + this->param_.gamma() * this->iter_, + -this->param_.power()); + } else if (lr_policy == "multistep") { + if (this->current_step_ < this->param_.stepvalue_size() && + this->iter_ >= this->param_.stepvalue(this->current_step_)) { + this->current_step_++; + LOG(INFO) << "MultiStep Status: Iteration " << + this->iter_ << ", step = " << this->current_step_; + } + rate = this->param_.base_lr() * + pow(this->param_.gamma(), this->current_step_); + } else if (lr_policy == "poly") { + rate = this->param_.base_lr() * pow(Dtype(1.) - + (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + this->param_.power()); + } else if (lr_policy == "sigmoid") { + rate = this->param_.base_lr() * (Dtype(1.) / + (Dtype(1.) 
+ exp(-this->param_.gamma() * (Dtype(this->iter_) - + Dtype(this->param_.stepsize()))))); + } else { + LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; + } + return rate; } -template +template void SGDSolver::PreSolve() { - // Initialize the history - const vector > >& net_params = this->net_->params(); - history_.clear(); - update_.clear(); - temp_.clear(); - for (int i = 0; i < net_params.size(); ++i) { - const vector& shape = net_params[i]->shape(); - history_.push_back(shared_ptr >(new Blob(shape))); - update_.push_back(shared_ptr >(new Blob(shape))); - temp_.push_back(shared_ptr >(new Blob(shape))); - } + // Initialize the history + const vector > >& net_params = this->net_->params(); + history_.clear(); + update_.clear(); + temp_.clear(); + for (int i = 0; i < net_params.size(); ++i) { + const vector& shape = net_params[i]->shape(); + history_.push_back(shared_ptr < Blob > (new Blob(shape))); + update_.push_back(shared_ptr < Blob > (new Blob(shape))); + temp_.push_back(shared_ptr < Blob > (new Blob(shape))); + } } -template +template void SGDSolver::ClipGradients() { - const Dtype clip_gradients = this->param_.clip_gradients(); - if (clip_gradients < 0) { return; } - const vector > >& net_params = this->net_->params(); - Dtype sumsq_diff = 0; - for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - sumsq_diff += net_params[i]->sumsq_diff(); - } - } - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - if (l2norm_diff > clip_gradients) { - Dtype scale_factor = clip_gradients / l2norm_diff; - LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " - << l2norm_diff << " > " << clip_gradients << ") " - << "by scale factor " << scale_factor; - for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - net_params[i]->scale_diff(scale_factor); - } - } - } + const Dtype clip_gradients = this->param_.clip_gradients(); + if (clip_gradients < 0) { + return; + } + const vector > >& 
net_params = this->net_->params(); + Dtype sumsq_diff = 0; + for (int i = 0; i < net_params.size(); ++i) { + if (this->net_->param_owners()[i] < 0) { + sumsq_diff += net_params[i]->sumsq_diff(); + } + } + const Dtype l2norm_diff = std::sqrt(sumsq_diff); + if (l2norm_diff > clip_gradients) { + Dtype scale_factor = clip_gradients / l2norm_diff; + LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " + << l2norm_diff << " > " << clip_gradients << ") " + << "by scale factor " << scale_factor; + for (int i = 0; i < net_params.size(); ++i) { + if (this->net_->param_owners()[i] < 0) { + net_params[i]->scale_diff(scale_factor); + } + } + } } -template +template void SGDSolver::ApplyUpdate() { - Dtype rate = GetLearningRate(); - if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; - } - ClipGradients(); - for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { - Normalize(param_id); - Regularize(param_id); - ComputeUpdateValue(param_id, rate); - } - this->net_->Update(); + Dtype rate = GetLearningRate(); + if (this->param_.display() && this->iter_ % this->param_.display() == 0) { + LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + } + ClipGradients(); + for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { + Normalize(param_id); + Regularize(param_id); + ComputeUpdateValue(param_id, rate); + } + this->net_->Update(); } -template +template void SGDSolver::Normalize(int param_id) { - if (this->param_.iter_size() == 1) { return; } - // Scale gradient to counterbalance accumulation. - const vector > >& net_params = this->net_->params(); - const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); - switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + if (this->param_.iter_size() == 1) { + return; + } + // Scale gradient to counterbalance accumulation. + const vector > >& net_params = this->net_->params(); + const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); + switch (Caffe::mode()) { + case Caffe::CPU: { + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_gpu_diff()); + caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } -template +template void SGDSolver::Regularize(int param_id) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_weight_decay = - this->net_->params_weight_decay(); - Dtype weight_decay = this->param_.weight_decay(); - string regularization_type = this->param_.regularization_type(); - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - - switch (Caffe::mode()) { - case Caffe::CPU: { - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else if (regularization_type == "L1") { - caffe_cpu_sign(net_params[param_id]->count(), - net_params[param_id]->cpu_data(), - temp_[param_id]->mutable_cpu_data()); - caffe_axpy(net_params[param_id]->count(), - local_decay, - 
temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + switch (Caffe::mode()) { + case Caffe::CPU: { + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else if (regularization_type == "L1") { + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else if (regularization_type == "L1") { - caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), + 
local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } -template +template void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - // Compute the update to history, then copy it to the parameter diff. - switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; + // Compute the update to history, then copy it to the parameter diff. 
+ switch (Caffe::mode()) { + case Caffe::CPU: { + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + history_[param_id]->mutable_gpu_data()); + caffe_gpu_copy(net_params[param_id]->count(), + history_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } -template +template void SGDSolver::SnapshotSolverState(SolverState* state) { - state->clear_history(); - for (int i = 0; i < history_.size(); ++i) { - // Add history - BlobProto* history_blob = state->add_history(); - history_[i]->ToProto(history_blob); - } + state->clear_history(); + for (int i = 0; i < history_.size(); ++i) { + // Add history + BlobProto* history_blob = state->add_history(); + history_[i]->ToProto(history_blob); + } } -template +template void SGDSolver::RestoreSolverState(const SolverState& state) { - CHECK_EQ(state.history_size(), history_.size()) - << "Incorrect length of history blobs."; - LOG(INFO) << "SGDSolver: restoring history"; - for (int i = 0; i < history_.size(); ++i) { - history_[i]->FromProto(state.history(i)); - } + CHECK_EQ(state.history_size(), history_.size()) + << "Incorrect length of history blobs."; + LOG(INFO) << 
"SGDSolver: restoring history"; + for (int i = 0; i < history_.size(); ++i) { + history_[i]->FromProto(state.history(i)); + } } -template +template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - this->history_[param_id]->mutable_cpu_data()); - - // compute update: step back then over step - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->cpu_data(), -momentum, - this->update_[param_id]->mutable_cpu_data()); - - // copy - caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; + switch (Caffe::mode()) { + case Caffe::CPU: { + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + // update history + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + this->history_[param_id]->mutable_cpu_data()); + + // compute update: step back then over step + caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->cpu_data(), -momentum, 
+ this->update_[param_id]->mutable_cpu_data()); + + // copy + caffe_copy(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - // update history - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - this->history_[param_id]->mutable_gpu_data()); - - // compute update: step back then over step - caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->gpu_data(), -momentum, - this->update_[param_id]->mutable_gpu_data()); - - // copy - caffe_gpu_copy(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + // update history + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + this->history_[param_id]->mutable_gpu_data()); + + // compute update: step back then over step + caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->gpu_data(), -momentum, + this->update_[param_id]->mutable_gpu_data()); + + // copy + caffe_gpu_copy(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } -template +template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = 
this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype delta = this->param_.delta(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_add(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->history_[param_id]->mutable_cpu_data()); - - // prepare update - caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_cpu_data()); - - caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // scale and copy - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype delta = this->param_.delta(); + Dtype local_rate = rate * net_params_lr[param_id]; + switch (Caffe::mode()) { + case Caffe::CPU: { + // compute square of gradient in update + caffe_powx(net_params[param_id]->count(), + net_params[param_id]->cpu_diff(), Dtype(2), + this->update_[param_id]->mutable_cpu_data()); + + // update history + caffe_add(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + this->history_[param_id]->cpu_data(), + this->history_[param_id]->mutable_cpu_data()); + + // prepare update + caffe_powx(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), Dtype(0.5), + 
this->update_[param_id]->mutable_cpu_data()); + + caffe_add_scalar(net_params[param_id]->count(), + delta, this->update_[param_id]->mutable_cpu_data()); + + caffe_div(net_params[param_id]->count(), + net_params[param_id]->cpu_diff(), + this->update_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + // scale and copy + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->cpu_data(), Dtype(0), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - // compute square of gradient in update - caffe_gpu_powx(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), Dtype(2), - this->update_[param_id]->mutable_gpu_data()); - - // update history - caffe_gpu_add(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - this->history_[param_id]->gpu_data(), - this->history_[param_id]->mutable_gpu_data()); - - // prepare update - caffe_gpu_powx( net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_div(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), - this->update_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - // scale and copy - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->gpu_data(), Dtype(0), - net_params[param_id]->mutable_gpu_diff()); + // compute square of gradient in update + caffe_gpu_powx(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), Dtype(2), + this->update_[param_id]->mutable_gpu_data()); + + // update history + caffe_gpu_add(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + this->history_[param_id]->gpu_data(), + this->history_[param_id]->mutable_gpu_data()); + + // prepare update + 
caffe_gpu_powx(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(), + delta, this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_div(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), + this->update_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + // scale and copy + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->gpu_data(), Dtype(0), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } -INSTANTIATE_CLASS(Solver); -INSTANTIATE_CLASS(SGDSolver); -INSTANTIATE_CLASS(NesterovSolver); -INSTANTIATE_CLASS(AdaGradSolver); +INSTANTIATE_CLASS (Solver); +INSTANTIATE_CLASS (SGDSolver); +INSTANTIATE_CLASS (NesterovSolver); +INSTANTIATE_CLASS (AdaGradSolver); } // namespace caffe diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 8cf9bc7b..94d62e0e 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -36,143 +36,156 @@ namespace caffe { SyncedMemory::~SyncedMemory() { -if (cpu_ptr_ && own_cpu_data_) { - OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL) ); - clFinish(amdDevice.CommandQueue); - } - if(gpu_cache_ptr_ && own_cpu_data_) { - OCL_CHECK( clReleaseMemObject((cl_mem)gpu_cache_ptr_) ); - } - if (gpu_ptr_) { - OCL_CHECK( clReleaseMemObject((cl_mem)gpu_ptr_) ); - } - - clReleaseKernel(oclmem_kernel); -} + if (cpu_ptr_ && own_cpu_data_) { + OCL_CHECK( + clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + cpu_ptr_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + } + if (gpu_cache_ptr_ && own_cpu_data_) { + 
OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_)); + } + if (gpu_ptr_) { + OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_)); + } + + clReleaseKernel (oclmem_kernel); +} void SyncedMemory::ocl_setup() { - cl_int err=0; - oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); - OCL_CHECK(err); + cl_int err = 0; + oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); + OCL_CHECK(err); } inline void SyncedMemory::to_cpu() { -switch (head_) { - case UNINITIALIZED: - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); - //} - cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); - memset(cpu_ptr_, 0, size_); - head_ = HEAD_AT_CPU; - own_cpu_data_ = true; - break; - case HEAD_AT_GPU:{ + switch (head_) { + case UNINITIALIZED: + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, + size_, NULL, NULL); + //} + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, + 0, NULL, NULL, NULL); + memset(cpu_ptr_, 0, size_); + head_ = HEAD_AT_CPU; + own_cpu_data_ = true; + break; + case HEAD_AT_GPU: { #ifndef CPU_ONLY - if (cpu_ptr_ == NULL) { - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); - cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); - own_cpu_data_ = true; - } - OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_ptr_, (cl_mem)gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); - head_ = SYNCED; + if (cpu_ptr_ == NULL) { + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, + CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | 
CL_MAP_WRITE, 0, + size_, 0, NULL, NULL, NULL); + own_cpu_data_ = true; + } + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_, + (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + head_ = SYNCED; #else - NO_GPU; + NO_GPU; #endif #ifdef Track_data_transfer - LOG(WARNING) << "sync: data from GPU to CPU"; + LOG(WARNING) << "sync: data from GPU to CPU"; #endif - break; - } - case HEAD_AT_CPU: - case SYNCED: - break; - } + break; + } + case HEAD_AT_CPU: + case SYNCED: + break; + } } inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY -switch (head_) { - case UNINITIALIZED:{ - cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL); - if(NULL == tmpMem){ - fprintf(stderr,"Failed to create memory object\n"); - break; - } - ocl_memset(oclmem_kernel, tmpMem, (int)0, (int)(size_/sizeof(int))); - gpu_ptr_ = (void*)tmpMem; - head_ = HEAD_AT_GPU; - break; - } - case HEAD_AT_CPU:{ - if (gpu_ptr_ == NULL) { - cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL); - if(NULL == tmpMem){ - fprintf(stderr,"Failed to create memory object\n"); - } - gpu_ptr_ = (void*)tmpMem; - } - OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, (cl_mem)gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); - head_ = SYNCED; + switch (head_) { + case UNINITIALIZED: { + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + size_, NULL, NULL); + if (NULL == tmpMem) { + fprintf(stderr, "Failed to create memory object\n"); + break; + } + ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int))); + gpu_ptr_ = (void*) tmpMem; + head_ = HEAD_AT_GPU; + break; + } + case HEAD_AT_CPU: { + if (gpu_ptr_ == NULL) { + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + size_, NULL, NULL); + if (NULL == tmpMem) { + fprintf(stderr, "Failed to create memory object\n"); + } + gpu_ptr_ = 
(void*) tmpMem; + } + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + head_ = SYNCED; #ifdef Track_data_transfer - LOG(WARNING) << "sync: data from CPU to GPU"; + LOG(WARNING) << "sync: data from CPU to GPU"; #endif - break; - } - case HEAD_AT_GPU: - case SYNCED: - break; - } + break; + } + case HEAD_AT_GPU: + case SYNCED: + break; + } #else - NO_GPU; + NO_GPU; #endif } const void* SyncedMemory::cpu_data() { - to_cpu(); - return (const void*)cpu_ptr_; + to_cpu(); + return (const void*) cpu_ptr_; } void SyncedMemory::set_cpu_data(void* data) { - CHECK(data); - if (own_cpu_data_) { - CaffeFreeHost(cpu_ptr_); - } - cpu_ptr_ = data; - head_ = HEAD_AT_CPU; - own_cpu_data_ = false; + CHECK(data); + if (own_cpu_data_) { + CaffeFreeHost (cpu_ptr_); + } + cpu_ptr_ = data; + head_ = HEAD_AT_CPU; + own_cpu_data_ = false; } const void* SyncedMemory::gpu_data() { #ifndef CPU_ONLY - to_gpu(); - return (const void*)gpu_ptr_; + to_gpu(); + return (const void*) gpu_ptr_; #else - NO_GPU; + NO_GPU; #endif } void* SyncedMemory::mutable_cpu_data() { - to_cpu(); - head_ = HEAD_AT_CPU; - return cpu_ptr_; + to_cpu(); + head_ = HEAD_AT_CPU; + return cpu_ptr_; } void* SyncedMemory::mutable_gpu_data() { #ifndef CPU_ONLY - to_gpu(); - head_ = HEAD_AT_GPU; - return gpu_ptr_; + to_gpu(); + head_ = HEAD_AT_GPU; + return gpu_ptr_; #else - NO_GPU; + NO_GPU; #endif } const void *SyncedMemory::gpu_cache_data() { - return 0; + return 0; } - } // namespace caffe diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 0383fd27..7d0a85aa 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -6,114 +6,113 @@ namespace caffe { Timer::Timer() - : initted_(false), - running_(false), - has_run_at_least_once_(false) { - Init(); + : initted_(false), + running_(false), + has_run_at_least_once_(false) { + Init(); } Timer::~Timer() { } void 
Timer::Start() { - if (!running()) { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - running_ = true; - has_run_at_least_once_ = true; - } + if (!running()) { + start_cpu_ = boost::posix_time::microsec_clock::local_time(); + running_ = true; + has_run_at_least_once_ = true; + } } void Timer::Stop() { - if (running()) { - stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - running_ = false; - } + if (running()) { + stop_cpu_ = boost::posix_time::microsec_clock::local_time(); + running_ = false; + } } - float Timer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - return elapsed_microseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + + elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); + return elapsed_microseconds_; } float Timer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - return elapsed_milliseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + + elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); + return elapsed_milliseconds_; } float Timer::Seconds() { - return MilliSeconds() / 1000.; + return MilliSeconds() / 1000.; } void Timer::Init() { - if (!initted()) { - if (Caffe::mode() == Caffe::GPU) { - } - initted_ = true; - } + if (!initted()) { + if (Caffe::mode() == Caffe::GPU) { + } + initted_ = true; + } } CPUTimer::CPUTimer() { - this->initted_ = true; - this->running_ = false; - 
this->has_run_at_least_once_ = false; + this->initted_ = true; + this->running_ = false; + this->has_run_at_least_once_ = false; } void CPUTimer::Start() { - if (!running()) { - this->start_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = true; - this->has_run_at_least_once_ = true; - } + if (!running()) { + this->start_cpu_ = boost::posix_time::microsec_clock::local_time(); + this->running_ = true; + this->has_run_at_least_once_ = true; + } } void CPUTimer::Stop() { - if (running()) { - this->stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = false; - } + if (running()) { + this->stop_cpu_ = boost::posix_time::microsec_clock::local_time(); + this->running_ = false; + } } float CPUTimer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); - return this->elapsed_milliseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + this->elapsed_milliseconds_ = (this->stop_cpu_ - + this->start_cpu_).total_milliseconds(); + return this->elapsed_milliseconds_; } float CPUTimer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); - return this->elapsed_microseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + this->elapsed_microseconds_ = (this->stop_cpu_ - + this->start_cpu_).total_microseconds(); + return this->elapsed_microseconds_; } } // namespace caffe diff --git a/src/caffe/util/cudnn.cpp 
b/src/caffe/util/cudnn.cpp index 1772f009..43492ce7 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -2,22 +2,22 @@ #include "caffe/util/cudnn.hpp" namespace caffe { -namespace cudnn { + namespace cudnn { -float dataType::oneval = 1.0; -float dataType::zeroval = 0.0; -const void* dataType::one = - static_cast(&dataType::oneval); -const void* dataType::zero = - static_cast(&dataType::zeroval); + float dataType::oneval = 1.0; + float dataType::zeroval = 0.0; + const void* dataType::one = + static_cast(&dataType::oneval); + const void* dataType::zero = + static_cast(&dataType::zeroval); -double dataType::oneval = 1.0; -double dataType::zeroval = 0.0; -const void* dataType::one = - static_cast(&dataType::oneval); -const void* dataType::zero = - static_cast(&dataType::zeroval); + double dataType::oneval = 1.0; + double dataType::zeroval = 0.0; + const void* dataType::one = + static_cast(&dataType::oneval); + const void* dataType::zero = + static_cast(&dataType::zeroval); -} // namespace cudnn + } // namespace cudnn } // namespace caffe #endif diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp index f55420e9..50d8cbf7 100644 --- a/src/caffe/util/db.cpp +++ b/src/caffe/util/db.cpp @@ -4,27 +4,28 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { DB* GetDB(DataParameter::DB backend) { - switch (backend) { - case DataParameter_DB_LEVELDB: - return new LevelDB(); - case DataParameter_DB_LMDB: - return new LMDB(); - default: - LOG(FATAL) << "Unknown database backend"; - } + switch (backend) { + case DataParameter_DB_LEVELDB: + return new LevelDB(); + case DataParameter_DB_LMDB: + return new LMDB(); + default: + LOG(FATAL) << "Unknown database backend"; + } } DB* GetDB(const string& backend) { - if (backend == "leveldb") { - return new LevelDB(); - } else if (backend == "lmdb") { - return new LMDB(); - } else { - LOG(FATAL) << "Unknown database backend"; - } + if (backend == "leveldb") { + return new LevelDB(); + } 
else if (backend == "lmdb") { + return new LMDB(); + } else { + LOG(FATAL) << "Unknown database backend"; + } } } // namespace db diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index 06c46627..aec747af 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -2,19 +2,20 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { void LevelDB::Open(const string& source, Mode mode) { - leveldb::Options options; - options.block_size = 65536; - options.write_buffer_size = 268435456; - options.max_open_files = 100; - options.error_if_exists = mode == NEW; - options.create_if_missing = mode != READ; - leveldb::Status status = leveldb::DB::Open(options, source, &db_); - CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); - LOG(INFO) << "Opened leveldb " << source; + leveldb::Options options; + options.block_size = 65536; + options.write_buffer_size = 268435456; + options.max_open_files = 100; + options.error_if_exists = mode == NEW; + options.create_if_missing = mode != READ; + leveldb::Status status = leveldb::DB::Open(options, source, &db_); + CHECK(status.ok()) << "Failed to open leveldb " << source + << std::endl << status.ToString(); + LOG(INFO) << "Opened leveldb " << source; } } // namespace db diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index a054b796..bc1a0da1 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -4,47 +4,48 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { const size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB void LMDB::Open(const string& source, Mode mode) { - MDB_CHECK(mdb_env_create(&mdb_env_)); - MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); - if (mode == NEW) { - CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; - } - int flags = 0; - if (mode == READ) { - flags = MDB_RDONLY | MDB_NOTLS; - } - 
MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664)); - LOG(INFO) << "Opened lmdb " << source; + MDB_CHECK(mdb_env_create(&mdb_env_)); + MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); + if(mode == NEW) { + CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; + } + int flags = 0; + if (mode == READ) { + flags = MDB_RDONLY | MDB_NOTLS; + } + MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664)); + LOG(INFO) << "Opened lmdb " << source; } LMDBCursor* LMDB::NewCursor() { - MDB_txn* mdb_txn; - MDB_cursor* mdb_cursor; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); - return new LMDBCursor(mdb_txn, mdb_cursor); + MDB_txn* mdb_txn; + MDB_cursor* mdb_cursor; + MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); + MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); + MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); + return new LMDBCursor(mdb_txn, mdb_cursor); } LMDBTransaction* LMDB::NewTransaction() { - MDB_txn* mdb_txn; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - return new LMDBTransaction(&mdb_dbi_, mdb_txn); + MDB_txn* mdb_txn; + MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); + MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); + return new LMDBTransaction(&mdb_dbi_, mdb_txn); } void LMDBTransaction::Put(const string& key, const string& value) { - MDB_val mdb_key, mdb_value; - mdb_key.mv_data = const_cast(key.data()); - mdb_key.mv_size = key.size(); - mdb_value.mv_data = const_cast(value.data()); - mdb_value.mv_size = value.size(); - MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0)); + MDB_val mdb_key, mdb_value; + mdb_key.mv_data = const_cast(key.data()); + mdb_key.mv_size = key.size(); + mdb_value.mv_data = const_cast(value.data()); + mdb_value.mv_size = 
value.size(); + MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0)); } } // namespace db diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 089023b7..69cc47bc 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -34,330 +34,352 @@ namespace caffe { -template extern std::string get_dtype_suffix(); +template extern std::string get_dtype_suffix(); -template +template void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int channels_col = channels * kernel_h * kernel_w; - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % kernel_w; - int h_offset = (c / kernel_w) % kernel_h; - int c_im = c / kernel_h / kernel_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = 0; - } - } - } + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) { + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < 
width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = 0; + } + } + } } template void im2col_cpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_col); template void im2col_cpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_col); -template +template void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im) { - caffe_set(height * width * channels, Dtype(0), data_im); - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_h * patch_w; - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int c_im = c / patch_h / patch_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + 
w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; - } - } - } + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im) { + caffe_set(height * width * channels, Dtype(0), data_im); + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_h * patch_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int c_im = c / patch_h / patch_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_im[(c_im * height + h_pad) * width + w_pad] += + data_col[(c * height_col + h) * width_col + w]; + } + } + } } template void col2im_cpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); template void col2im_cpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); - - -template -void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int 
optnum){ - std::string kernel_name = "col2im_opt" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset); - ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im); + +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, int optnum) { + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int 
height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void col2im_gpu_opt(const float* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_im, const int img_offset, int optnum); -template void col2im_gpu_opt(const double* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_im, const int img_offset, int optnum); - -template -void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int 
height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col, const int col_offset) -{ - std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&kernel_h); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&kernel_w); - - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad_h); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_w); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&stride_h); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_w); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +template void col2im_gpu_opt(const float* data_col, const int col_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_im, const int img_offset, int optnum); +template void col2im_gpu_opt(const 
double* data_col, + const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_im, const int img_offset, int optnum); + +template +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) + { + std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w); + + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + 
OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void im2col_gpu(const float* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col, const int col_offset); -template void im2col_gpu(const double* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col, const int col_offset); - -template +template void im2col_gpu(const float* data_im, const int img_offset, + const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col, const int col_offset); +template void im2col_gpu(const double* data_im, const int img_offset, + const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col, const int col_offset); + +template void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset) -{ - std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - 
ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&patch_h); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&patch_w); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_h); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&pad_w); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_h); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&stride_w); - ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) + { + std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, 
sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu(const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, float* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, float* data_im, const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w,const int stride_h, const int stride_w, - double* data_im, const int img_offset); - -template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const 
int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); - clFinish(amdDevice.CommandQueue); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im, const int img_offset); + +template +void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height_col * 
width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); } -template void im2col_gpu(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset); -template void im2col_gpu(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset); - -template -void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum) { - - std::string kernel_name = "im2col_opt" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int 
height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = optnum * channels * height_col * width_col; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset); - ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {(size_t)(256 - 256 % width_col)}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +template void im2col_gpu(cl_kernel Kernel, const float* data_im, + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset); +template void im2col_gpu(cl_kernel Kernel, const double* data_im, + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset); + +template +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, + const int height, const int width, const int 
ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, int optnum) { + + std::string kernel_name = "im2col_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = optnum * channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void im2col_gpu_opt(const float* data_im, const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset, int optnum); -template void im2col_gpu_opt(const double* data_im, const int img_offset, 
const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset, int optnum); +template void im2col_gpu_opt(const float* data_im, const int img_offset, + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset, int optnum); +template void im2col_gpu_opt(const double* data_im, + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset, int optnum); -template +template void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset) { - std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride); - ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col); - ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col); - ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im); - 
ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } - -template void col2im_gpu(const float* data_col, const int col_offset, 
const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, float* data_im, const int img_offset); -template void col2im_gpu(const double* data_col, const int col_offset, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, double* data_im, const int img_offset); - +template void col2im_gpu(const float* data_col, const int col_offset, + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, float* data_im, const int img_offset); +template void col2im_gpu(const double* data_col, const int col_offset, + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, double* data_im, const int img_offset); } // namespace caffe diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index d52acb54..be0ce3b4 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -10,125 +10,124 @@ namespace caffe { template __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_col) { - CUDA_KERNEL_LOOP(index, n) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - Dtype* data_col_ptr = data_col; - data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; - const Dtype* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + i; - int w = w_in + j; - *data_col_ptr = (h >= 0 && w 
>= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; - data_col_ptr += height_col * width_col; - } - } - } + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_col) { + CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + Dtype* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? + data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } } -template +template void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { - // We are going to launch channels * height_col * width_col kernels, each - // kernel responsible for copying a single-channel grid. 
- int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int num_kernels = channels * height_col * width_col; - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_gpu_kernel<<>>( - num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, - pad_w, stride_h, stride_w, height_col, - width_col, data_col); - CUDA_POST_KERNEL_CHECK; + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + // NOLINT_NEXT_LINE(whitespace/operators) + im2col_gpu_kernel<<>>( + num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, + pad_w, stride_h, stride_w, height_col, + width_col, data_col); + CUDA_POST_KERNEL_CHECK; } - // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_gpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); 
template __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_im) { - CUDA_KERNEL_LOOP(index, n) { - Dtype val = 0; - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; - int w_col_end = min(w / stride_w + 1, width_col); - int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; - int h_col_end = min(h / stride_h + 1, height_col); - // equivalent implementation - int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; - int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; - int coeff_w_col = (1 - stride_w * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_im) { + CUDA_KERNEL_LOOP(index, n) { + Dtype val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } -template +template void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int num_kernels = channels * height * width; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - // NOLINT_NEXT_LINE(whitespace/operators) - col2im_gpu_kernel<<>>( - num_kernels, data_col, height, width, channels, patch_h, patch_w, - pad_h, pad_w, stride_h, stride_w, - height_col, width_col, data_im); - CUDA_POST_KERNEL_CHECK; + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im) { + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + col2im_gpu_kernel<<>>( + num_kernels, data_col, height, width, channels, patch_h, patch_w, + pad_h, pad_w, stride_h, stride_w, + height_col, width_col, data_im); + CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void col2im_gpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); template void col2im_gpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im); } // namespace caffe diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp index 416f80ab..2fbad3a9 100644 --- a/src/caffe/util/insert_splits.cpp +++ b/src/caffe/util/insert_splits.cpp @@ -10,135 +10,135 @@ namespace caffe { void InsertSplits(const NetParameter& param, NetParameter* param_split) { - // Initialize by copying from the input NetParameter. - param_split->CopyFrom(param); - param_split->clear_layer(); - map > blob_name_to_last_top_idx; - map, pair > bottom_idx_to_source_top_idx; - map, int> top_idx_to_bottom_count; - map, float> top_idx_to_loss_weight; - map, int> top_idx_to_bottom_split_idx; - map layer_idx_to_layer_name; - layer_idx_to_layer_name[-1] = "input"; - // Determine the number of times each blob is used as an input (bottom) blob. 
- for (int i = 0; i < param.input_size(); ++i) { - const string& blob_name = param.input(i); - blob_name_to_last_top_idx[blob_name] = make_pair(-1, i); - } - for (int i = 0; i < param.layer_size(); ++i) { - const LayerParameter& layer_param = param.layer(i); - layer_idx_to_layer_name[i] = layer_param.name(); - for (int j = 0; j < layer_param.bottom_size(); ++j) { - const string& blob_name = layer_param.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { - LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; - } - const pair& bottom_idx = make_pair(i, j); - const pair& top_idx = blob_name_to_last_top_idx[blob_name]; - bottom_idx_to_source_top_idx[bottom_idx] = top_idx; - ++top_idx_to_bottom_count[top_idx]; - } - for (int j = 0; j < layer_param.top_size(); ++j) { - const string& blob_name = layer_param.top(j); - blob_name_to_last_top_idx[blob_name] = make_pair(i, j); - } - // A use of a top blob as a loss should be handled similarly to the use of - // a top blob as an input (bottom) blob to another layer. - const int last_loss = - std::min(layer_param.loss_weight_size(), layer_param.top_size()); - for (int j = 0; j < last_loss; ++j) { - const string& blob_name = layer_param.top(j); - const pair& top_idx = blob_name_to_last_top_idx[blob_name]; - top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j); - if (top_idx_to_loss_weight[top_idx]) { - ++top_idx_to_bottom_count[top_idx]; - } - } - } - // Create split layer for any input blobs used by other layer as bottom - // blobs more than once. 
- for (int i = 0; i < param.input_size(); ++i) { - const int split_count = top_idx_to_bottom_count[make_pair(-1, i)]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[-1]; - const string& blob_name = param.input(i); - LayerParameter* split_layer_param = param_split->add_layer(); - const float kZeroLossWeight = 0; - ConfigureSplitLayer(layer_name, blob_name, i, split_count, - kZeroLossWeight, split_layer_param); - } - } - for (int i = 0; i < param.layer_size(); ++i) { - LayerParameter* layer_param = param_split->add_layer(); - layer_param->CopyFrom(param.layer(i)); - // Replace any shared bottom blobs with split layer outputs. - for (int j = 0; j < layer_param->bottom_size(); ++j) { - const pair& top_idx = - bottom_idx_to_source_top_idx[make_pair(i, j)]; - const int split_count = top_idx_to_bottom_count[top_idx]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[top_idx.first]; - const string& blob_name = layer_param->bottom(j); - layer_param->set_bottom(j, SplitBlobName(layer_name, - blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); - } - } - // Create split layer for any top blobs used by other layer as bottom - // blobs more than once. - for (int j = 0; j < layer_param->top_size(); ++j) { - const pair& top_idx = make_pair(i, j); - const int split_count = top_idx_to_bottom_count[top_idx]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[i]; - const string& blob_name = layer_param->top(j); - LayerParameter* split_layer_param = param_split->add_layer(); - const float loss_weight = top_idx_to_loss_weight[top_idx]; - ConfigureSplitLayer(layer_name, blob_name, j, split_count, - loss_weight, split_layer_param); - if (loss_weight) { - layer_param->clear_loss_weight(); - top_idx_to_bottom_split_idx[top_idx]++; - } - } - } - } + // Initialize by copying from the input NetParameter. 
+ param_split->CopyFrom(param); + param_split->clear_layer(); + map > blob_name_to_last_top_idx; + map, pair > bottom_idx_to_source_top_idx; + map, int> top_idx_to_bottom_count; + map, float> top_idx_to_loss_weight; + map, int> top_idx_to_bottom_split_idx; + map layer_idx_to_layer_name; + layer_idx_to_layer_name[-1] = "input"; + // Determine the number of times each blob is used as an input (bottom) blob. + for (int i = 0; i < param.input_size(); ++i) { + const string& blob_name = param.input(i); + blob_name_to_last_top_idx[blob_name] = make_pair(-1, i); + } + for (int i = 0; i < param.layer_size(); ++i) { + const LayerParameter& layer_param = param.layer(i); + layer_idx_to_layer_name[i] = layer_param.name(); + for (int j = 0; j < layer_param.bottom_size(); ++j) { + const string& blob_name = layer_param.bottom(j); + if (blob_name_to_last_top_idx.find(blob_name) == + blob_name_to_last_top_idx.end()) { + LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; + } + const pair& bottom_idx = make_pair(i, j); + const pair& top_idx = blob_name_to_last_top_idx[blob_name]; + bottom_idx_to_source_top_idx[bottom_idx] = top_idx; + ++top_idx_to_bottom_count[top_idx]; + } + for (int j = 0; j < layer_param.top_size(); ++j) { + const string& blob_name = layer_param.top(j); + blob_name_to_last_top_idx[blob_name] = make_pair(i, j); + } + // A use of a top blob as a loss should be handled similarly to the use of + // a top blob as an input (bottom) blob to another layer. + const int last_loss = + std::min(layer_param.loss_weight_size(), layer_param.top_size()); + for (int j = 0; j < last_loss; ++j) { + const string& blob_name = layer_param.top(j); + const pair& top_idx = blob_name_to_last_top_idx[blob_name]; + top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j); + if (top_idx_to_loss_weight[top_idx]) { + ++top_idx_to_bottom_count[top_idx]; + } + } + } + // Create split layer for any input blobs used by other layer as bottom + // blobs more than once. 
+ for (int i = 0; i < param.input_size(); ++i) { + const int split_count = top_idx_to_bottom_count[make_pair(-1, i)]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[-1]; + const string& blob_name = param.input(i); + LayerParameter* split_layer_param = param_split->add_layer(); + const float kZeroLossWeight = 0; + ConfigureSplitLayer(layer_name, blob_name, i, split_count, + kZeroLossWeight, split_layer_param); + } + } + for (int i = 0; i < param.layer_size(); ++i) { + LayerParameter* layer_param = param_split->add_layer(); + layer_param->CopyFrom(param.layer(i)); + // Replace any shared bottom blobs with split layer outputs. + for (int j = 0; j < layer_param->bottom_size(); ++j) { + const pair& top_idx = + bottom_idx_to_source_top_idx[make_pair(i, j)]; + const int split_count = top_idx_to_bottom_count[top_idx]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[top_idx.first]; + const string& blob_name = layer_param->bottom(j); + layer_param->set_bottom(j, SplitBlobName(layer_name, + blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); + } + } + // Create split layer for any top blobs used by other layer as bottom + // blobs more than once. 
+ for (int j = 0; j < layer_param->top_size(); ++j) { + const pair& top_idx = make_pair(i, j); + const int split_count = top_idx_to_bottom_count[top_idx]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[i]; + const string& blob_name = layer_param->top(j); + LayerParameter* split_layer_param = param_split->add_layer(); + const float loss_weight = top_idx_to_loss_weight[top_idx]; + ConfigureSplitLayer(layer_name, blob_name, j, split_count, + loss_weight, split_layer_param); + if (loss_weight) { + layer_param->clear_loss_weight(); + top_idx_to_bottom_split_idx[top_idx]++; + } + } + } + } } void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param) { - split_layer_param->Clear(); - split_layer_param->add_bottom(blob_name); - split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx)); - split_layer_param->set_type("Split"); - for (int k = 0; k < split_count; ++k) { - split_layer_param->add_top( - SplitBlobName(layer_name, blob_name, blob_idx, k)); - if (loss_weight) { - if (k == 0) { - split_layer_param->add_loss_weight(loss_weight); - } else { - split_layer_param->add_loss_weight(0); - } - } - } + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param) { + split_layer_param->Clear(); + split_layer_param->add_bottom(blob_name); + split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx)); + split_layer_param->set_type("Split"); + for (int k = 0; k < split_count; ++k) { + split_layer_param->add_top( + SplitBlobName(layer_name, blob_name, blob_idx, k)); + if (loss_weight) { + if (k == 0) { + split_layer_param->add_loss_weight(loss_weight); + } else { + split_layer_param->add_loss_weight(0); + } + } + } } string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx) { - ostringstream split_layer_name; - 
split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split"; - return split_layer_name.str(); + const int blob_idx) { + ostringstream split_layer_name; + split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx + << "_split"; + return split_layer_name.str(); } string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx) { - ostringstream split_blob_name; - split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split_" << split_idx; - return split_blob_name.str(); + const int blob_idx, const int split_idx) { + ostringstream split_blob_name; + split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx + << "_split_" << split_idx; + return split_blob_name.str(); } } // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 77ef7f25..c3be8a76 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -30,277 +30,277 @@ using google::protobuf::io::CodedOutputStream; using google::protobuf::Message; bool ReadProtoFromTextFile(const char* filename, Message* proto) { - int fd = open(filename, O_RDONLY); - CHECK_NE(fd, -1) << "File not found: " << filename; - FileInputStream* input = new FileInputStream(fd); - bool success = google::protobuf::TextFormat::Parse(input, proto); - delete input; - close(fd); - return success; + int fd = open(filename, O_RDONLY); + CHECK_NE(fd, -1) << "File not found: " << filename; + FileInputStream* input = new FileInputStream(fd); + bool success = google::protobuf::TextFormat::Parse(input, proto); + delete input; + close(fd); + return success; } void WriteProtoToTextFile(const Message& proto, const char* filename) { - int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); - FileOutputStream* output = new FileOutputStream(fd); - CHECK(google::protobuf::TextFormat::Print(proto, output)); - delete output; - close(fd); + int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + 
FileOutputStream* output = new FileOutputStream(fd); + CHECK(google::protobuf::TextFormat::Print(proto, output)); + delete output; + close(fd); } bool ReadProtoFromBinaryFile(const char* filename, Message* proto) { - int fd = open(filename, O_RDONLY); - CHECK_NE(fd, -1) << "File not found: " << filename; - ZeroCopyInputStream* raw_input = new FileInputStream(fd); - CodedInputStream* coded_input = new CodedInputStream(raw_input); - coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912); + int fd = open(filename, O_RDONLY); + CHECK_NE(fd, -1) << "File not found: " << filename; + ZeroCopyInputStream* raw_input = new FileInputStream(fd); + CodedInputStream* coded_input = new CodedInputStream(raw_input); + coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912); - bool success = proto->ParseFromCodedStream(coded_input); + bool success = proto->ParseFromCodedStream(coded_input); - delete coded_input; - delete raw_input; - close(fd); - return success; + delete coded_input; + delete raw_input; + close(fd); + return success; } void WriteProtoToBinaryFile(const Message& proto, const char* filename) { - fstream output(filename, ios::out | ios::trunc | ios::binary); - CHECK(proto.SerializeToOstream(&output)); + fstream output(filename, ios::out | ios::trunc | ios::binary); + CHECK(proto.SerializeToOstream(&output)); } cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { - cv::Mat cv_img; - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); - if (!cv_img_origin.data) { - LOG(ERROR) << "Could not open or find file " << filename; - return cv_img_origin; - } - if (height > 0 && width > 0) { - cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); - } else { - cv_img = cv_img_origin; - } - return cv_img; + const int height, const int width, const bool is_color) { + cv::Mat cv_img; + int cv_read_flag = (is_color ? 
CV_LOAD_IMAGE_COLOR : + CV_LOAD_IMAGE_GRAYSCALE); + cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); + if (!cv_img_origin.data) { + LOG(ERROR) << "Could not open or find file " << filename; + return cv_img_origin; + } + if (height > 0 && width > 0) { + cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); + } else { + cv_img = cv_img_origin; + } + return cv_img; } cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { - return ReadImageToCVMat(filename, height, width, true); + const int height, const int width) { + return ReadImageToCVMat(filename, height, width, true); } cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { - return ReadImageToCVMat(filename, 0, 0, is_color); + const bool is_color) { + return ReadImageToCVMat(filename, 0, 0, is_color); } cv::Mat ReadImageToCVMat(const string& filename) { - return ReadImageToCVMat(filename, 0, 0, true); + return ReadImageToCVMat(filename, 0, 0, true); } // Do the file extension and encoding match? static bool matchExt(const std::string & fn, - std::string en) { - size_t p = fn.rfind('.'); - std::string ext = p != fn.npos ? fn.substr(p) : fn; - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - std::transform(en.begin(), en.end(), en.begin(), ::tolower); - if ( ext == en ) - return true; - if ( en == "jpg" && ext == "jpeg" ) - return true; - return false; + std::string en) { + size_t p = fn.rfind('.'); + std::string ext = p != fn.npos ? 
fn.substr(p) : fn; + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + std::transform(en.begin(), en.end(), en.begin(), ::tolower); + if (ext == en) + return true; + if (en == "jpg" && ext == "jpeg") + return true; + return false; } bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { - cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); - if (cv_img.data) { - if (encoding.size()) { - if ( (cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding) ) - return ReadFileToDatum(filename, label, datum); - std::vector buf; - cv::imencode("."+encoding, cv_img, buf); - datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); - datum->set_label(label); - datum->set_encoded(true); - return true; - } - CVMatToDatum(cv_img, datum); - datum->set_label(label); - return true; - } else { - return false; - } + const int height, const int width, const bool is_color, + const std::string & encoding, Datum* datum) { + cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); + if (cv_img.data) { + if (encoding.size()) { + if ((cv_img.channels() == 3) == is_color && !height && !width && + matchExt(filename, encoding)) + return ReadFileToDatum(filename, label, datum); + std::vector < uchar > buf; + cv::imencode("." 
+ encoding, cv_img, buf); + datum->set_data(std::string(reinterpret_cast(&buf[0]), + buf.size())); + datum->set_label(label); + datum->set_encoded(true); + return true; + } + CVMatToDatum(cv_img, datum); + datum->set_label(label); + return true; + } else { + return false; + } } bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { - std::streampos size; + Datum* datum) { + std::streampos size; - fstream file(filename.c_str(), ios::in|ios::binary|ios::ate); - if (file.is_open()) { - size = file.tellg(); - std::string buffer(size, ' '); - file.seekg(0, ios::beg); - file.read(&buffer[0], size); - file.close(); - datum->set_data(buffer); - datum->set_label(label); - datum->set_encoded(true); - return true; - } else { - return false; - } + fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); + if (file.is_open()) { + size = file.tellg(); + std::string buffer(size, ' '); + file.seekg(0, ios::beg); + file.read(&buffer[0], size); + file.close(); + datum->set_data(buffer); + datum->set_label(label); + datum->set_encoded(true); + return true; + } else { + return false; + } } cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + data.size()); - cv_img = cv::imdecode(vec_data, -1); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; + cv::Mat cv_img; + CHECK(datum.encoded()) << "Datum not encoded"; + const string& data = datum.data(); + std::vector vec_data(data.c_str(), data.c_str() + data.size()); + cv_img = cv::imdecode(vec_data, -1); + if (!cv_img.data) { + LOG(ERROR) << "Could not decode datum "; + } + return cv_img; } cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + 
data.size()); - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv_img = cv::imdecode(vec_data, cv_read_flag); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; + cv::Mat cv_img; + CHECK(datum.encoded()) << "Datum not encoded"; + const string& data = datum.data(); + std::vector vec_data(data.c_str(), data.c_str() + data.size()); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : + CV_LOAD_IMAGE_GRAYSCALE); + cv_img = cv::imdecode(vec_data, cv_read_flag); + if (!cv_img.data) { + LOG(ERROR) << "Could not decode datum "; + } + return cv_img; } // If Datum is encoded will decoded using DecodeDatumToCVMat and CVMatToDatum // If Datum is not encoded will do nothing bool DecodeDatumNative(Datum* datum) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } + if (datum->encoded()) { + cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); + CVMatToDatum(cv_img, datum); + return true; + } else { + return false; + } } bool DecodeDatum(Datum* datum, bool is_color) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } + if (datum->encoded()) { + cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); + CVMatToDatum(cv_img, datum); + return true; + } else { + return false; + } } void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - datum->set_channels(cv_img.channels()); - datum->set_height(cv_img.rows); - datum->set_width(cv_img.cols); - datum->clear_data(); - datum->clear_float_data(); - datum->set_encoded(false); - int datum_channels = datum->channels(); - int datum_height = datum->height(); - int datum_width = datum->width(); - int datum_size = datum_channels * datum_height * datum_width; - std::string 
buffer(datum_size, ' '); - for (int h = 0; h < datum_height; ++h) { - const uchar* ptr = cv_img.ptr(h); - int img_index = 0; - for (int w = 0; w < datum_width; ++w) { - for (int c = 0; c < datum_channels; ++c) { - int datum_index = (c * datum_height + h) * datum_width + w; - buffer[datum_index] = static_cast(ptr[img_index++]); - } - } - } - datum->set_data(buffer); + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + datum->set_channels(cv_img.channels()); + datum->set_height(cv_img.rows); + datum->set_width(cv_img.cols); + datum->clear_data(); + datum->clear_float_data(); + datum->set_encoded(false); + int datum_channels = datum->channels(); + int datum_height = datum->height(); + int datum_width = datum->width(); + int datum_size = datum_channels * datum_height * datum_width; + std::string buffer(datum_size, ' '); + for (int h = 0; h < datum_height; ++h) { + const uchar* ptr = cv_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < datum_width; ++w) { + for (int c = 0; c < datum_channels; ++c) { + int datum_index = (c * datum_height + h) * datum_width + w; + buffer[datum_index] = static_cast(ptr[img_index++]); + } + } + } + datum->set_data(buffer); } // Verifies format of data stored in HDF5 file and reshapes blob accordingly. -template +template void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { - // Verify that the dataset exists. - CHECK(H5LTfind_dataset(file_id, dataset_name_)) - << "Failed to find HDF5 dataset " << dataset_name_; - // Verify that the number of dimensions is in the accepted range. - herr_t status; - int ndims; - status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims); - CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_; - CHECK_GE(ndims, min_dim); - CHECK_LE(ndims, max_dim); + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob) { + // Verify that the dataset exists. 
+ CHECK(H5LTfind_dataset(file_id, dataset_name_)) + << "Failed to find HDF5 dataset " << dataset_name_; + // Verify that the number of dimensions is in the accepted range. + herr_t status; + int ndims; + status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims); + CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_; + CHECK_GE(ndims, min_dim); + CHECK_LE(ndims, max_dim); - // Verify that the data format is what we expect: float or double. - std::vector dims(ndims); - H5T_class_t class_; - status = H5LTget_dataset_info( - file_id, dataset_name_, dims.data(), &class_, NULL); - CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; - CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; + // Verify that the data format is what we expect: float or double. + std::vector < hsize_t > dims(ndims); + H5T_class_t class_; + status = H5LTget_dataset_info( + file_id, dataset_name_, dims.data(), &class_, NULL); + CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; + CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; - vector blob_dims(dims.size()); - for (int i = 0; i < dims.size(); ++i) { - blob_dims[i] = dims[i]; - } - blob->Reshape(blob_dims); + vector blob_dims(dims.size()); + for (int i = 0; i < dims.size(); ++i) { + blob_dims[i] = dims[i]; + } + blob->Reshape(blob_dims); } -template <> +template<> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_float( - file_id, dataset_name_, blob->mutable_cpu_data()); - CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; + int min_dim, int max_dim, Blob* blob) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + herr_t status = H5LTread_dataset_float( + file_id, dataset_name_, blob->mutable_cpu_data()); + CHECK_GE(status, 
0) << "Failed to read float dataset " << dataset_name_; } -template <> +template<> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_double( - file_id, dataset_name_, blob->mutable_cpu_data()); - CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; + int min_dim, int max_dim, Blob* blob) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + herr_t status = H5LTread_dataset_double( + file_id, dataset_name_, blob->mutable_cpu_data()); + CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; } -template <> +template<> void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { - hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; + const hid_t file_id, const string& dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_float( + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; } -template <> +template<> void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { - hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to 
make double dataset " << dataset_name; + const hid_t file_id, const string& dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_double( + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; } } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 80843191..61162be6 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -42,426 +42,493 @@ namespace caffe { template<> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); } template<> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); } -template <> +template<> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); - CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0, + ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template<> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0, + ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template<> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { - cl_event event; - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) ); - return event; -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, + offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); + return event; +} + +template<> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { - cl_event event; - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) ); - return event; -} - - -template <> -cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) { - cl_event event; - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - int ldc = N; - CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); - return event; - } - -template <> -cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) { - cl_event event; - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) ); - return event; -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, + offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); + return event; +} + +template<> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, + offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template<> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, + offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template<> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -template <> +template<> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -template <> +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, size_t offA, int lda, - const float* x, size_t offx, const float beta, int incx, - float* y, size_t offy, int incy) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, - M, N, (cl_float)alpha, (cl_mem)A, offA, lda, - (cl_mem)x, offx, incx, (cl_float)beta, - (cl_mem)y, offy, incy, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); -} - -template <> + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, + float* y, size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, + M, N, (cl_float) alpha, (cl_mem) A, offA, lda, + (cl_mem) x, offx, incx, (cl_float) beta, + (cl_mem) y, offy, incy, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, size_t offA, int lda, - const double* x, size_t offx, const double beta, int incx, - double* y, size_t offy, int incy) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, + double* y, size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } - -template <> +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, - M, N, (cl_float)alpha, (cl_mem)A, 0, N, - (cl_mem)x, 0, 1, (cl_float)beta, - (cl_mem)y, 0, 1, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, + M, N, (cl_float) alpha, (cl_mem) A, 0, N, + (cl_mem) x, 0, 1, (cl_float) beta, + (cl_mem) y, 0, 1, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template <> +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, 0, N, (cl_mem)x, 0, 1, (cl_double)beta, (cl_mem)y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template <> +template<> void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + float* Y) { + cblas_saxpy(N, alpha, X, 1, Y, 1); +} -template <> +template<> void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + double* Y) { + cblas_daxpy(N, alpha, X, 1, Y, 1); +} -template <> +template<> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { - CLBLAS_CHECK( clblasSaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) ); + float* Y) { + CLBLAS_CHECK( + clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template <> +template<> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { - CLBLAS_CHECK( clblasDaxpy(N, alpha, 
(cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) ); + double* Y) { + CLBLAS_CHECK( + clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } template<> void caffe_gpu_sgnbit(const int n, const float* x, float* y) -{ + { } template<> void caffe_gpu_sgnbit(const int n, const double* x, double* y) -{ + { } template<> void caffe_gpu_abs(const int n, const float* x, float* y) -{ - caffe_gpu_abs_ocl(n, x, y); + { + caffe_gpu_abs_ocl(n, x, y); } template<> void caffe_gpu_abs(const int n, const double* x, double* y) -{ - caffe_gpu_abs_ocl(n, x, y); + { + caffe_gpu_abs_ocl(n, x, y); } -template <> +template<> void caffe_set(const int N, const float alpha, float* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(float) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(float) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } -template <> +template<> void caffe_set(const int N, const double alpha, double* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(double) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } -template <> +template<> void caffe_add_scalar(const int N, const float alpha, float* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } } -template <> +template<> void caffe_add_scalar(const int N, const double alpha, double* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } } -template <> +template<> void caffe_copy(const int N, const float* X, float* Y) { - cblas_scopy(N, X, 1, Y, 1); + cblas_scopy(N, X, 1, Y, 1); } -template <> +template<> void caffe_copy(const int N, const double* X, double* Y) { - cblas_dcopy(N, X, 1, Y, 1); + cblas_dcopy(N, 
X, 1, Y, 1); } //template void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) -{ - clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)X, CL_TRUE, 0, N, Y,0, NULL, NULL); + { + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, + NULL, NULL); // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); } /* -template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); -template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); -template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); -template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); -*/ -template<> + template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); + template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); + template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); + template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); + */ +template<> void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) -{ OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); + { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N, + 0, NULL, NULL)); } -template<> +template<> void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) -{ OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); + { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N, + 0, NULL, NULL)); } -template <> +template<> void caffe_gpu_copy(const int N, const float* X, float* Y) { - if(X != Y){ - CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); - } + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + 
&(amdDevice.CommandQueue), 0, NULL, NULL)); + } } -template <> +template<> void caffe_gpu_copy(const int N, const double* X, double* Y) { - if(X != Y){ - CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); - } + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } } -template <> +template<> void caffe_scal(const int N, const float alpha, float *X) { - cblas_sscal(N, alpha, X, 1); + cblas_sscal(N, alpha, X, 1); } -template <> +template<> void caffe_scal(const int N, const double alpha, double *X) { - cblas_dscal(N, alpha, X, 1); + cblas_dscal(N, alpha, X, 1); } -template <> +template<> void caffe_gpu_scal(const int N, const float alpha, float *X) { - CLBLAS_CHECK(clblasSscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + CLBLAS_CHECK( + clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } -template <> +template<> void caffe_gpu_scal(const int N, const double alpha, double *X) { - CLBLAS_CHECK(clblasDscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + CLBLAS_CHECK( + clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } -template <> +template<> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } -template <> +template<> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } -template <> +template<> void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float 
beta, float* Y) { - cblas_saxpby(N, alpha, X, 1, beta, Y, 1); + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } -template <> +template<> void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - cblas_daxpby(N, alpha, X, 1, beta, Y, 1); + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } -template <> +template<> void caffe_add(const int n, const float* a, const float* b, - float* y) { - vsAdd(n, a, b, y); + float* y) { + vsAdd(n, a, b, y); } -template <> +template<> void caffe_add(const int n, const double* a, const double* b, - double* y) { - vdAdd(n, a, b, y); + double* y) { + vdAdd(n, a, b, y); } -template <> +template<> void caffe_sub(const int n, const float* a, const float* b, - float* y) { - vsSub(n, a, b, y); + float* y) { + vsSub(n, a, b, y); } -template <> +template<> void caffe_sub(const int n, const double* a, const double* b, - double* y) { - vdSub(n, a, b, y); + double* y) { + vdSub(n, a, b, y); } -template <> +template<> void caffe_mul(const int n, const float* a, const float* b, - float* y) { - vsMul(n, a, b, y); + float* y) { + vsMul(n, a, b, y); } -template <> +template<> void caffe_mul(const int n, const double* a, const double* b, - double* y) { - vdMul(n, a, b, y); + double* y) { + vdMul(n, a, b, y); } -template <> +template<> void caffe_div(const int n, const float* a, const float* b, - float* y) { - vsDiv(n, a, b, y); + float* y) { + vsDiv(n, a, b, y); } -template <> +template<> void caffe_div(const int n, const double* a, const double* b, - double* y) { - vdDiv(n, a, b, y); + double* y) { + vdDiv(n, a, b, y); } -template <> +template<> void caffe_powx(const int n, const float* a, const float b, - float* y) { - vsPowx(n, a, b, y); + float* y) { + vsPowx(n, a, b, y); } -template <> +template<> void caffe_powx(const int n, const double* a, const double b, - double* y) { - vdPowx(n, a, b, y); + double* y) { + vdPowx(n, a, b, y); } -template 
<> +template<> void caffe_sqr(const int n, const float* a, float* y) { - vsSqr(n, a, y); + vsSqr(n, a, y); } -template <> +template<> void caffe_sqr(const int n, const double* a, double* y) { - vdSqr(n, a, y); + vdSqr(n, a, y); } -template <> +template<> void caffe_exp(const int n, const float* a, float* y) { - vsExp(n, a, y); + vsExp(n, a, y); } -template <> +template<> void caffe_exp(const int n, const double* a, double* y) { - vdExp(n, a, y); + vdExp(n, a, y); } unsigned int caffe_rng_rand() { - return (*caffe_rng())(); + return (*caffe_rng())(); } -template +template Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter( - b, std::numeric_limits::max()); + return boost::math::nextafter < Dtype > ( + b, std::numeric_limits < Dtype > ::max()); } template @@ -470,65 +537,66 @@ float caffe_nextafter(const float b); template double caffe_nextafter(const double b); -template +template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_LE(a, b); - boost::uniform_real random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real < Dtype + > random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } - //LOG(INFO) << "caffe_rng_uniform"; + //LOG(INFO) << "caffe_rng_uniform"; } template void caffe_rng_uniform(const int n, const float a, const float b, - float* r); + float* r); template void caffe_rng_uniform(const int n, const double a, const double b, - double* r); + double* r); -template +template void caffe_rng_gaussian(const int n, const Dtype a, - const Dtype sigma, Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GT(sigma, 0); - 
boost::normal_distribution random_distribution(a, sigma); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - //variate_generator(37, random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } - //LOG(INFO) << "caffe_rng_guassian"; + const Dtype sigma, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution < Dtype > random_distribution(a, sigma); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + //variate_generator(37, random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } + //LOG(INFO) << "caffe_rng_guassian"; } template void caffe_rng_gaussian(const int n, const float mu, - const float sigma, float* r); + const float sigma, float* r); template void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); + const double sigma, double* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } } template @@ -537,18 +605,18 @@ void caffe_rng_bernoulli(const int n, const double p, int* r); template void caffe_rng_bernoulli(const int n, const float p, int* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - 
variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = static_cast(variate_generator()); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } } template @@ -557,365 +625,375 @@ void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); // -template <> +template<> float caffe_cpu_dot(const int n, const float* x, const float* y) { - return cblas_sdot(n, x, 1, y, 1); + return cblas_sdot(n, x, 1, y, 1); } -template <> +template<> double caffe_cpu_dot(const int n, const double* x, const double* y) { - return cblas_ddot(n, x, 1, y, 1); + return cblas_ddot(n, x, 1, y, 1); } -template <> +template<> void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL); - cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL); - clblasSdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_out); + float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); 
+ clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); } -template <> +template<> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { - //need to pass in scratchBuff - //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(double)), NULL, NULL); - cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(double)), NULL, NULL); - clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), out,0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_out); -} - -template <> + double * out) { + //need to pass in scratchBuff + //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template<> int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcount(static_cast(x[i]) ^ - static_cast(y[i])); - } - return dist; + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; } -template <> +template<> int caffe_cpu_hamming_distance(const int n, const double* x, - 
const double* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcountl(static_cast(x[i]) ^ - static_cast(y[i])); - } - return dist; + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; } -template <> +template<> float caffe_cpu_asum(const int n, const float* x) { - return cblas_sasum(n, x, 1); + return cblas_sasum(n, x, 1); } -template <> +template<> double caffe_cpu_asum(const int n, const double* x) { - return cblas_dasum(n, x, 1); + return cblas_dasum(n, x, 1); } -template <> +template<> void caffe_gpu_asum(const int n, const float* x, float* y) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_float)), NULL, NULL); - cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_float)), NULL, NULL); - clblasSasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_y); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); } -template <> +template<> void caffe_gpu_asum(const int n, const double* x, double* y) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_double)), NULL, NULL); - cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_double)), NULL, NULL); - 
clblasDasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), y,0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_y); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); } //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - // - (x[index] < Dtype(0))); +// - (x[index] < Dtype(0))); //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); -INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign); -INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit); -INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); -template <> +template<> void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { - cblas_scopy(n, x, 1, y, 1); - cblas_sscal(n, alpha, y, 1); + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); } -template <> +template<> void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { - cblas_dcopy(n, x, 1, y, 1); - cblas_dscal(n, alpha, y, 1); + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); } -template <> +template<> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { - caffe_gpu_copy(n, x, y); - caffe_gpu_scal(n, alpha, y); + float* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } -template <> +template<> void caffe_gpu_scale(const 
int n, const double alpha, const double *x, - double* y) { - caffe_gpu_copy(n, x, y); - caffe_gpu_scal(n, alpha, y); + double* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } -template +template void set_kernel(const int n, const Dtype alpha, Dtype* y) { } -template <> +template<> void caffe_gpu_set(const int N, const float alpha, float* Y) { - ocl_memset(Y, alpha, N); + ocl_memset(Y, alpha, N); } -template <> +template<> void caffe_gpu_set(const int N, const double alpha, double* Y) { - ocl_memset(Y, alpha, N); + ocl_memset(Y, alpha, N); } -template <> +template<> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - kernel_add_scalar(N, alpha, Y); + kernel_add_scalar(N, alpha, Y); } -template <> +template<> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { - kernel_add_scalar(N, alpha, Y); + kernel_add_scalar(N, alpha, Y); } -template <> +template<> void caffe_gpu_exp(const int N, const float* a, float* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } -template <> +template<> void caffe_gpu_exp(const int N, const double* a, double* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } template<> -void caffe_gpu_sign(const int N, const float *X, float *Y){ - caffe_gpu_sign_ocl(N, X, Y); +void caffe_gpu_sign(const int N, const float *X, float *Y) { + caffe_gpu_sign_ocl(N, X, Y); } template<> -void caffe_gpu_sign(const int N, const double *X, double *Y){ - caffe_gpu_sign_ocl(N, X, Y); +void caffe_gpu_sign(const int N, const double *X, double *Y) { + caffe_gpu_sign_ocl(N, X, Y); } -template <> +template<> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_sub(N, a, b, y); + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); } -template <> +template<> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_sub(N, a, b, y); 
+ double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); } -template <> +template<> void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { - kernel_mul(N, a, b, y); + const float* b, float* y) { + kernel_mul(N, a, b, y); } -template <> +template<> void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { - kernel_mul(N, a, b, y); + const double* b, double* y) { + kernel_mul(N, a, b, y); } -template <> +template<> void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { - kernel_div(N, a, b, y); + const float* b, float* y) { + kernel_div(N, a, b, y); } -template <> +template<> void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { - kernel_div(N, a, b, y); + const double* b, double* y) { + kernel_div(N, a, b, y); } -template <> +template<> void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_powx(N, a, alpha, y); + const float alpha, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); } -template <> +template<> void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_powx(N, a, alpha, y); + const double alpha, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); } void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { + const float* b, uint8_t* y) { } void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { + const double* b, uint8_t* y) { } -template <> +template<> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, - const float* y) { + const float* y) { return 0; } -template <> +template<> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { + const double* y) { return 0; } void caffe_gpu_rng_uniform(const int n, unsigned 
int* r) { } -template <> +template<> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { + float* r) { } -template <> +template<> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { + double* r) { } -template <> +template<> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { + float* r) { } -template <> +template<> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { + double* r) { } -template <> +template<> void caffe_gpu_log(const int N, const float* a, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_log(N, a, y); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); } -template <> +template<> void caffe_gpu_log(const int N, const double* a, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_log(N, a, y); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); } - - - - - -template <> +template<> void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); + vsLn(n, a, y); } -template <> +template<> void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); + vdLn(n, a, y); } -template +template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); + // NOLINT_NEXT_LINE(caffe/alt_fn) + //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); #else - NO_GPU; + NO_GPU; #endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } } template void caffe_copy(const int N, const int* X, int* Y); template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); + unsigned int* Y); template 
void caffe_copy(const int N, const float* X, float* Y); template void caffe_copy(const int N, const double* X, double* Y); -template <> +template<> void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); + vsAbs(n, a, y); } -template <> +template<> void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); + vdAbs(n, a, y); } -template <> +template<> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_add(N, a, b, y); + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); } -template <> +template<> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_add(N, a, b, y); + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); } -template <> +template<> float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); } -template <> +template<> double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); } -template +template void caffe_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } template void caffe_set(const int N, const int alpha, int* Y); template void caffe_set(const int N, const float alpha, float* Y); template void caffe_set(const int N, const double alpha, double* Y); - } // 
namespace caffe diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 2631a074..1bf783e4 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -12,152 +12,152 @@ namespace caffe { -template <> +template<> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +template<> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -template <> + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, + A, N, x, 1, &beta, y, 1)); } -template <> +template<> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, + A, N, x, 1, &beta, y, 1)); } -template <> +template<> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { - CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); + float* Y) { + CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } -template <> +template<> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { - CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); + double* Y) { + CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { - if (X != Y) { - CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) - } + if (X != Y) { + CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) + } } -template <> +template<> void caffe_gpu_scal(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } -template <> +template<> void caffe_gpu_scal(const int N, const double alpha, double *X) { - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } -template <> +template<> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } 
-template <> +template<> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } -template <> +template<> void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { - CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + float* out) { + CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } -template <> +template<> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { - CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + double * out) { + CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } -template <> +template<> void caffe_gpu_asum(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); } -template <> +template<> void caffe_gpu_asum(const int n, const double* x, double* y) { - CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); } -template <> +template<> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { - CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + float* y) { + CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } -template <> +template<> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { - CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + double* y) { + CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 
1)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = alpha; - } + CUDA_KERNEL_LOOP(index, n) { + y[index] = alpha; + } } -template +template void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) - return; - } - // NOLINT_NEXT_LINE(whitespace/operators) - set_kernel<<>>( - N, alpha, Y); + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) + return; + } + // NOLINT_NEXT_LINE(whitespace/operators) +set_kernel<<>>( + N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); @@ -166,300 +166,301 @@ template void caffe_gpu_set(const int N, const double alpha, double* Y); template __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } +CUDA_KERNEL_LOOP(index, n) { + y[index] += alpha; +} } -template <> +template<> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +// NOLINT_NEXT_LINE(whitespace/operators) +add_scalar_kernel<<>>( +N, alpha, Y); } -template <> +template<> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +add_scalar_kernel<<>>( +N, alpha, Y); } template __global__ void add_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] + b[index]; +} } -template <> +template<> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - 
add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } -template <> +template<> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } template __global__ void sub_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] - b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] - b[index]; +} } -template <> +template<> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } -template <> +template<> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } template __global__ void mul_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] * b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] * b[index]; +} } -template <> +template<> void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { +const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } -template <> +template<> void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { +const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } template __global__ void div_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] / b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] / 
b[index]; +} } -template <> +template<> void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { +const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } -template <> +template<> void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { +const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } template __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = abs(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = abs(a[index]); +} } -template <> +template<> void caffe_gpu_abs(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } -template <> +template<> void caffe_gpu_abs(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } - template __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = exp(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = exp(a[index]); +} } -template <> +template<> void caffe_gpu_exp(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); +exp_kernel<<>>( +N, a, y); } -template <> +template<> void caffe_gpu_exp(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); +exp_kernel<<>>( +N, a, y); } template __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = log(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = log(a[index]); +} } -template <> +template<> void caffe_gpu_log(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - 
log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } -template <> +template<> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } template __global__ void powx_kernel(const int n, const Dtype* a, - const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = pow(a[index], alpha); - } +const Dtype alpha, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = pow(a[index], alpha); +} } -template <> +template<> void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { +const float alpha, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } -template <> +template<> void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { +const double alpha, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - - (x[index] < Dtype(0))); +- (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); __global__ void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popc(static_cast(a[index]) ^ - static_cast(b[index])); - } +const float* b, uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popc(static_cast(a[index]) ^ +static_cast(b[index])); +} } __global__ void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popcll(static_cast(a[index]) ^ - static_cast(b[index])); - } +const double* b, uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popcll(static_cast(a[index]) ^ +static_cast(b[index])); +} } -template <> +template<> uint32_t caffe_gpu_hamming_distance(const int n, const float* 
x, - const float* y) { +const float* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test - // TestHammingDistanceGPU in test_math_functions.cpp). - NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); + // TestHammingDistanceGPU in test_math_functions.cpp). +NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popc_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - (uint32_t) 0, thrust::plus()); +popc_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), +(uint32_t) 0, thrust::plus()); } -template <> +template<> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { +const double* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test - // TestHammingDistanceGPU in test_math_functions.cpp). - NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); + // TestHammingDistanceGPU in test_math_functions.cpp). 
+NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popcll_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - /* NOLINT_NEXT_LINE(build/include_what_you_use) */ - (uint32_t) 0, thrust::plus()); +popcll_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), +/* NOLINT_NEXT_LINE(build/include_what_you_use) */ +(uint32_t) 0, thrust::plus()); } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); +CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); } -template <> +template<> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { - CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); - const float range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } -} - -template <> +float* r) { +CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); +const float range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} +} + +template<> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { - CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); - const double range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } -} - -template <> +double* r) { +CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); +const double range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} +} + +template<> void 
caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { - CURAND_CHECK( - curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); +float* r) { +CURAND_CHECK( +curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } -template <> +template<> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { - CURAND_CHECK( - curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); +double* r) { +CURAND_CHECK( +curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); } } // namespace caffe diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 7f9631e2..6b8c5fee 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -32,51 +32,60 @@ #include "caffe/common.hpp" #include "caffe/util/ocl_util.hpp" namespace caffe { -template extern std::string get_dtype_suffix(); +template extern std::string get_dtype_suffix(); -template -void ocl_memset(Dtype* buffer, const Dtype value, const int count){ - std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int err=0; - err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); - err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value); - err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); - OCL_CHECK(err); - - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +template +void ocl_memset(Dtype* buffer, const Dtype value, const int count) { + std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err = 0; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); + err |= 
clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ocl_memset(int* buffer, const int value, const int count); -template void ocl_memset(float* buffer, const float value, const int count); -template void ocl_memset(double* buffer, const double value, const int count); - +template void ocl_memset(float* buffer, const float value, + const int count); +template void ocl_memset(double* buffer, const double value, + const int count); -void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){ - cl_int err; - err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer); - err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value); - err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count); - OCL_CHECK(err); +void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, + const int count) { + cl_int err; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -void eventCallback(cl_event event, cl_int event_status, void* user_data){ - cl_ulong ev_start_time = (cl_ulong)0; - cl_ulong ev_end_time = (cl_ulong)0; - double run_time; - OCL_CHECK( clGetEventProfilingInfo(event, 
CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL) ); - OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL) ); - run_time = (double)(ev_end_time - ev_start_time); - printf("The kernel's running time is %f s\n", run_time * 1.0e-9); +void eventCallback(cl_event event, cl_int event_status, void* user_data) { + cl_ulong ev_start_time = (cl_ulong) 0; + cl_ulong ev_end_time = (cl_ulong) 0; + double run_time; + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &ev_start_time, NULL)); + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), + &ev_end_time, NULL)); + run_time = (double) (ev_end_time - ev_start_time); + printf("The kernel's running time is %f s\n", run_time * 1.0e-9); } - } // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index c8f28426..f7cf9c07 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -34,1433 +34,1840 @@ #include "caffe/util/ocl_wrapper.hpp" namespace caffe { typedef unsigned int uint32_t; -struct array4x32 { uint32_t v[4]; }; -template -void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold) -{ - std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), 
(void*)&threshold); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&nrounds); - ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*)&size); - OCL_CHECK(ret); - - size_t globalws[1] = {size}; - size_t localws[1] = {256}; - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); -} -template void caffe_gpu_bernoulli(int* a, const unsigned int n, float inf, float sup, float threshold); -template void caffe_gpu_bernoulli(int* a, const unsigned int n, double inf, double sup, double threshold); - - -template -void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ - std::string kernel_name = "transform" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size2[]={(size_t)(M_ * packing_num)}; - size_t uiLocal_Work_Size2[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) ); -} - -template void transform_gpu(float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num); -template void transform_gpu(double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num); - -template -void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){ - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); - 
OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) ); - - size_t Global_Work_Size[1] = {(size_t)num}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data); -template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data); - - -template -void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){ - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); - - size_t Global_Work_Size[1] = {(size_t)num}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void exp_gpu(cl_kernel Kernel, const int num, const float* data, float* out); -template void exp_gpu(cl_kernel Kernel, const int num, const double* data, double* out); - -template -void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data){ - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); - - size_t Global_Work_Size[1] = {(size_t) (num * dim)}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void softmax_div_gpu(cl_kernel 
Kernel, const int num, const int dim, const float* scale, float* data); -template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data); - -template -Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss){ - - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&prob_data)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&d_loss)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&dim)); - OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); - - size_t globalws[1] = {256}; - size_t localws[1] = {256}; - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, localws, 0, NULL, NULL) ); - void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); - Dtype loss = *(Dtype*)h_loss; - clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, NULL); - - return loss; -} - -template float softmax_gpu(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss); -template double softmax_gpu(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss); - -template +struct array4x32 { + uint32_t v[4]; +}; +template +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, + Dtype threshold) + { + std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; 
+ ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, + localws, 0, NULL, NULL)); +} +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + double inf, double sup, double threshold); + +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num) { + std::string kernel_name = "transform" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} + +template void transform_gpu(float* src, float* dst, const int top_offset, + const int N_, const int M_, const int packing_num); +template 
void transform_gpu(double* src, double* dst, + const int top_offset, const int N_, const int M_, const int packing_num); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const float* bottom_data, float* scale_data); +template void get_max_gpu(cl_kernel Kernel, const int num, + const int dim, const double* bottom_data, double* scale_data); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void exp_gpu(cl_kernel Kernel, const int num, const float* data, + float* out); +template void exp_gpu(cl_kernel Kernel, const int num, + const double* data, double* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + 
OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t)(num * dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const float* scale, float* data); +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const double* scale, double* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); + + size_t globalws[1] = { 256 }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, + localws, 0, NULL, NULL)); + void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, + CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + Dtype loss = *(Dtype*) h_loss; + clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, + NULL); + + return loss; +} + +template float softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const float* prob_data, const float* label, cl_mem d_loss); +template double softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const double* prob_data, const double* label, cl_mem d_loss); + +template void kernel_channel_max(const int num, const int channels, - const int 
spatial_dim, const Dtype* data, Dtype* out) -{ - std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) ); - - size_t Global_Work_Size[1] = {(size_t) (num*spatial_dim)}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void kernel_channel_max( const int num, const int channels, - const int spatial_dim, const float* data, float* out); -template void kernel_channel_max( const int num, const int channels, - const int spatial_dim, const double* data, double* out); - -template -void kernel_channel_subtract( const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) -{ - std::string kernel_name = "kernel_channel_subtract" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) ); - OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); - - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); 
-} - -template void kernel_channel_subtract( const int count, - const int num, const int channels, - const int spatial_dim, const float* channel_max, float* data); -template void kernel_channel_subtract( const int count, - const int num, const int channels, - const int spatial_dim, const double* channel_max, double* data); - -template + const int spatial_dim, const Dtype* data, Dtype* out) + { + std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const float* data, float* out); +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const double* data, double* out); + +template +void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data) + { + std::string kernel_name = "kernel_channel_subtract" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + 
OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, const float* channel_max, float* data); +template void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, const double* channel_max, double* data); + +template void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) -{ - std::string kernel_name = "kernel_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, 
Local_Work_Size, 0, NULL, NULL)); } -template void kernel_mul(const int count, const float* a, const float* b, float* out); -template void kernel_mul(const int count, const double* a, const double* b, double* out); +template void kernel_mul(const int count, const float* a, const float* b, + float* out); +template void kernel_mul(const int count, const double* a, + const double* b, double* out); -template +template void kernel_add_scalar(const int count, const Dtype data, Dtype* out) -{ - std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); - - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void kernel_add_scalar(const int count, const float data, float* out); -template void kernel_add_scalar(const int count, const double data, double* out); - - -template -void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out) -{ - std::string kernel_name = "kernel_powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); - - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void kernel_powx(const int count, 
const float* data, const float alpha, float* out); -template void kernel_powx(const int count, const double* data, const double alpha, double* out); - -template + { + std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_add_scalar(const int count, const float data, + float* out); +template void kernel_add_scalar(const int count, const double data, + double* out); + +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, + Dtype* out) + { + std::string kernel_name = "kernel_powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_powx(const int count, const float* data, + const float alpha, float* out); +template void kernel_powx(const int count, const double* data, + const double alpha, double* out); + +template void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) -{ - std::string kernel_name = "kernel_div" + get_dtype_suffix(); - 
cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void kernel_div(const int count, const float* a, const float* b, float* out); -template void kernel_div(const int count, const double* a, const double* b, double* out); +template void kernel_div(const int count, const float* a, const float* b, + float* out); +template void kernel_div(const int count, const double* a, + const double* b, double* out); -template +template void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) -{ - std::string kernel_name = "kernel_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, 
sizeof(cl_mem), (void*)&a) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void kernel_add(const int count, const float* a, const float* b, float* out); -template void kernel_add(const int count, const double* a, const double* b, double* out); +template void kernel_add(const int count, const float* a, const float* b, + float* out); +template void kernel_add(const int count, const double* a, + const double* b, double* out); -template +template void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) -{ - std::string kernel_name = "kernel_sub" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_sub" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), 
(void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void kernel_sub(const int count, const float* a, const float* b, float* out); -template void kernel_sub(const int count, const double* a, const double* b, double* out); +template void kernel_sub(const int count, const float* a, const float* b, + float* out); +template void kernel_sub(const int count, const double* a, + const double* b, double* out); -template +template void kernel_log(const int count, const Dtype* data, Dtype* out) -{ - std::string kernel_name = "kernel_log" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_log" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t 
Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_log(const int count, const float* data, float* out); -template void kernel_log(const int count, const double* data, double* out); - +template void kernel_log(const int count, const double* data, + double* out); -template +template void kernel_exp(const int count, const Dtype* data, Dtype* out) -{ - std::string kernel_name = "kernel_exp" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + { + std::string kernel_name = "kernel_exp" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) ); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_exp(const int count, const float* data, float* out); -template void kernel_exp(const int count, const double* data, double* out); +template void kernel_exp(const int count, const double* data, + double* out); -template +template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) -{ - std::string kernel_name = 
"kernel_channel_sum" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); - - size_t Global_Work_Size[1] = {(size_t)(num*channels)}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum); -template void kernel_channel_sum(const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum); - -template + const int spatial_dim, const Dtype* data, Dtype* channel_sum) + { + std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + + size_t Global_Work_Size[1] = { (size_t)(num * channels) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const float* data, float* channel_sum); +template void kernel_channel_sum(const int num, const int channels, + 
const int spatial_dim, const double* data, double* channel_sum); + +template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) -{ - std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) ); - OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) ); - - size_t Global_Work_Size[1] = {(size_t)count}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const float* channel_sum, float* data); -template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const double* channel_sum, double* data); - -template + const int spatial_dim, const Dtype* channel_sum, Dtype* data) + { + std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t 
Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_div(const int count, const int num, + const int channels, + const int spatial_dim, const float* channel_sum, float* data); +template void kernel_channel_div(const int count, const int num, + const int channels, + const int spatial_dim, const double* channel_sum, double* data); + +template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) -{ - std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data_1) ); - OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) ); - OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) ); - - size_t Global_Work_Size[1] = {(size_t)(num*spatial_dim)}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) + { + std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), 
(void*) &data_1)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot); + const int spatial_dim, const float* data_1, const float* data_2, + float* channel_dot); template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot); - + const int spatial_dim, const double* data_1, const double* data_2, + double* channel_dot); -template +template void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) -{ - std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&prob_data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&loss)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&dim)); - OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*)&has_ignore_label_)); - OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); - 
OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); - - size_t Global_Work_Size[1] = {(size_t)nthreads}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void SoftmaxLossForwardGPU(const int nthreads, const float* prob_data, const float* label, float* loss, - const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,float* counts); -template void SoftmaxLossForwardGPU(const int nthreads, const double* prob_data, const double* label, double* loss, - const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts); - -template + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) + { + std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossForwardGPU(const int nthreads, + const float* prob_data, const float* label, float* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, float* counts); +template void SoftmaxLossForwardGPU(const int nthreads, + const double* prob_data, const double* label, double* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, double* counts); + +template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) -{ - std::string kernel_name = "SoftmaxLossBackwardGPU" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&dim)); - OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*)&has_ignore_label_)); - OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&ignore_label_)); - OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*)&counts)); - - size_t Global_Work_Size[1] = {(size_t)nthreads}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void 
SoftmaxLossBackwardGPU(const int nthreads, const float* top, const float* label, float* bottom_diff, - const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts); -template void SoftmaxLossBackwardGPU(const int nthreads, const double* top, const double* label, double* bottom_diff, - const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, double* counts); - -template -void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){ - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); - - size_t Global_Work_Size[1] = {(size_t)num}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void scal_gpu(cl_kernel Kernel, const int num, const float alpha, float* data); -template void scal_gpu(cl_kernel Kernel, const int num, const double alpha, double* data); - -template -void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype* label){ - OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) ); - OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) ); - OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) ); - OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) ); - - size_t Global_Work_Size[1] = {(size_t)num}; - size_t Local_Work_Size[1] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) ); -} - -template void diff_gpu(cl_kernel Kernel, const int num, const int dim, float* data, const float* label); -template void diff_gpu(cl_kernel Kernel, const int num, const int dim, double* data, const double* 
label); - -template -void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data); -template void max_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data); - -template -void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const 
int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){ - std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*)&mask); - ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void MaxPoolForward(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const 
int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask); -template void MaxPoolForward(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask); - -template -void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data) -{ - std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&idx_data); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), 
(void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void StoPoolForwardTrain(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data); -template void StoPoolForwardTrain(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data); - -template -void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){ - std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); - ret |= 
clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); - -} -template void StoPoolForwardTest(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data); -template void StoPoolForwardTest(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data); - -template -void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){ - std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), 
(void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} -template void AvePoolForward(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data); -template void AvePoolForward(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data); - -template -void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, 
sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); - ret |= clSetKernelArg(Kernel, 10,sizeof(cl_int), (void*)&pad_); - ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} - -template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* top_data); -template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_,const int stride_,const int pad_, double* top_data); - -template -void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_size_); - ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_); - ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} - -template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff); -template void max_pool_bp_gpu(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff ); - -template -void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int 
pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){ - std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&mask); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_mask); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_h); - ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&kernel_w); - ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_h); - ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&stride_w); - ret |= clSetKernelArg(Kernel,14, sizeof(cl_int), (void*)&pad_h); - ret |= clSetKernelArg(Kernel,15, sizeof(cl_int), (void*)&pad_w); - ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} - -template void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const 
int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); -template void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); - -template -void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff) -{ - std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&num); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&stride_h); - ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_w); - ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&pad_h); - ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&pad_w); - ret |= 
clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} -template void AvePoolBackward(const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff); -template void AvePoolBackward(const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff); - -template -void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){ - std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&height); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&width); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), 
(void*)&pooled_height); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_width); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_h); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_w); - ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_h); - ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_w); - ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)nthreads}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} -template void StoPoolBackward(const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff); -template void StoPoolBackward(const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff); - -template -void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){ - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_); - ret |= clSetKernelArg(Kernel, 4, 
sizeof(cl_int), (void*)&height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_); - ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&pad_); - ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[]={(size_t)count}; - size_t uiLocal_Work_Size[]={256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL)); -} - -template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff); -template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff); - - -template -void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor){ - std::string kernel_name = "PReLUForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), 
(void*)&top_data); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&div_factor); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void PReLUForward(const int count, const int channels, const int dim,const float* bottom_data, float* top_data, const float* slope_data, const int div_factor); -template void PReLUForward(const int count, const int channels, const int dim,const double* bottom_data, double* top_data, const double* slope_data, const int div_factor); - -template -void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor){ - std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void PReLUBackward(const int count, const int channels, const int dim, const float* top_diff, const float* bottom_data, float* bottom_diff, const float* 
slope_data, const int div_factor); -template void PReLUBackward(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor); - -template -void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data, const int offset_in, Dtype* bottom_diff){ - std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&offset_out); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data); - ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&offset_in); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void PReLUParamBackward(const int count, const float* top_diff, const int offset_out, const float* bottom_data, const int offset_in, float* bottom_diff); -template void PReLUParamBackward(const int count, const double* top_diff, const int offset_out, const double* bottom_data, const int offset_in, double* bottom_diff); - - -template -void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){ - std::string kernel_name = "ReLUForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - ret |= 
clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void ReLUForward(const int count, const float* bottom_data, float* top_data, float negative_slope); -template void ReLUForward(const int count, const double* bottom_data, double* top_data, double negative_slope); - -template -void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){ - std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} -template void ReLUBackward(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope); -template void ReLUBackward(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope); - -template -void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data){ - std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void SigmoidForward(const int count, const float* bottom_data, float* top_data); -template void SigmoidForward(const int count, const double* bottom_data, double* top_data); - -template -void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){ - std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} -template void SigmoidBackward(const int count, const float* top_diff, const float* top_data, float* bottom_diff); -template void SigmoidBackward(const int count, const double* top_diff, const double* top_data, double* bottom_diff); - -template -void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data){ - std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&threshold); - ret |= clSetKernelArg(Kernel, 2, 
sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void ThresholdForward(const int count, const float threshold, const float* bottom_data, float* top_data); -template void ThresholdForward(const int count, const double threshold, const double* bottom_data, double* top_data); - -template -void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data){ - std::string kernel_name = "TanHForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void TanHForward(const int count, const float* bottom_data, float* top_data); -template void TanHForward(const int count, const double* bottom_data, double* top_data); - -template -void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){ - std::string kernel_name = "TanHBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t 
uiGlobal_Work_Size[] = {(size_t)count}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} -template void TanHBackward(const int count, const float* top_diff, const float* top_data, float* bottom_diff); -template void TanHBackward(const int count, const double* top_diff, const double* top_data, double* bottom_diff); - -template + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) + { + std::string kernel_name = "SoftmaxLossBackwardGPU" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossBackwardGPU(const int nthreads, + const float* top, const float* label, float* bottom_diff, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, float* counts); 
+template void SoftmaxLossBackwardGPU(const int nthreads, + const double* top, const double* label, double* bottom_diff, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, double* counts); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void scal_gpu(cl_kernel Kernel, const int num, + const float alpha, float* data); +template void scal_gpu(cl_kernel Kernel, const int num, + const double alpha, double* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, + const Dtype* label) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + float* data, const float* label); +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + double* data, const double* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int 
pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + float* top_data); +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + double* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int 
stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask) { + std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolForward(const int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int 
stride_w_, + const int pad_h_, const int pad_w_, float* top_data, int* mask, + float* top_mask); +template void MaxPoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data, int* mask, + double* top_mask); + +template +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data) + { + std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t 
Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void StoPoolForwardTrain(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* idx_data, float* top_data); +template void StoPoolForwardTrain(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* idx_data, double* top_data); + +template +void StoPoolForwardTest(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= 
clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} +template void StoPoolForwardTest(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* top_data); +template void StoPoolForwardTest(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* top_data); + +template +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data) { + std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= 
clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolForward(const int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data); +template void AvePoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data) { + cl_int ret; + ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* top_data); +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int 
kernel_size_, + const int stride_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const float* top_data, const float* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, float* bottom_diff); +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const double* top_data, const double* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, double* bottom_diff); + +template +void MaxPoolBackward(const int nthreads, const 
Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolBackward(const int nthreads, + 
const float* const top_diff, const int* const mask, + const float* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void MaxPoolBackward(const int nthreads, + const double* const top_diff, const int* const mask, + const double* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) + { + std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 10, 
sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolBackward(const int nthreads, + const float* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void AvePoolBackward(const int nthreads, + const double* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff) { + std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff); + ret |= 
clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void StoPoolBackward(const int nthreads, + const float* const rand_idx, const float* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + float* const bottom_diff); +template void StoPoolBackward(const int nthreads, + const double* const rand_idx, const double* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + double* const bottom_diff); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const 
int pad_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* bottom_diff); +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* bottom_diff); + +template +void PReLUForward(const int count, const int channels, const int dim, + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor) { + std::string kernel_name = "PReLUForward" + 
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUForward(const int count, const int channels, + const int dim, const float* bottom_data, float* top_data, + const float* slope_data, const int div_factor); +template void PReLUForward(const int count, const int channels, + const int dim, const double* bottom_data, double* top_data, + const double* slope_data, const int div_factor); + +template +void PReLUBackward(const int count, const int channels, const int dim, + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor) { + std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data); + 
ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUBackward(const int count, const int channels, + const int dim, const float* top_diff, const float* bottom_data, + float* bottom_diff, const float* slope_data, const int div_factor); +template void PReLUBackward(const int count, const int channels, + const int dim, const double* top_diff, const double* bottom_data, + double* bottom_diff, const double* slope_data, const int div_factor); + +template +void PReLUParamBackward(const int count, const Dtype* top_diff, + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff) { + std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUParamBackward(const int count, const float* top_diff, + const int offset_out, const float* bottom_data, const int offset_in, + float* bottom_diff); +template void PReLUParamBackward(const int count, + const double* top_diff, const int offset_out, const double* bottom_data, + const int offset_in, double* 
bottom_diff); + +template +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, + Dtype negative_slope) { + std::string kernel_name = "ReLUForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ReLUForward(const int count, const float* bottom_data, + float* top_data, float negative_slope); +template void ReLUForward(const int count, const double* bottom_data, + double* top_data, double negative_slope); + +template +void ReLUBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { + std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void ReLUBackward(const int count, const float* top_diff, + const float* 
bottom_data, float* bottom_diff, float negative_slope); +template void ReLUBackward(const int count, const double* top_diff, + const double* bottom_data, double* bottom_diff, double negative_slope); + +template +void SigmoidForward(const int count, const Dtype* bottom_data, + Dtype* top_data) { + std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SigmoidForward(const int count, const float* bottom_data, + float* top_data); +template void SigmoidForward(const int count, const double* bottom_data, + double* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff) { + std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void SigmoidBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template 
void SigmoidBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template +void ThresholdForward(const int count, const Dtype threshold, + const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ThresholdForward(const int count, const float threshold, + const float* bottom_data, float* top_data); +template void ThresholdForward(const int count, const double threshold, + const double* bottom_data, double* top_data); + +template +void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "TanHForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void TanHForward(const int count, const float* bottom_data, + float* top_data); +template void TanHForward(const int count, const double* bottom_data, + double* top_data); 
+ +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, + Dtype* bottom_diff) { + std::string kernel_name = "TanHBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void TanHBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template void TanHBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { - std::string kernel_name = "opttrans" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int num_kernels = channels * height * width * optnum; - - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); - OCL_CHECK(ret); - - 
size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); -} - -template void opttrans(const float* data_im, const int im_offset, const int channels, - const int height, const int width, float* data_opt, const int opt_offset, const int optnum); -template void opttrans(const double* data_im, const int im_offset, const int channels, - const int height, const int width, double* data_opt, const int opt_offset, const int optnum); - -template -void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale){ - cl_int ret; - ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads); - ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in); - ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num); - ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size); - ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k); - ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={(size_t)nthreads}; - size_t uiLocal_Work_Size[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) ); -} -template void LRNFillScale(cl_kernel kernel, const int nthreads, const float* const in, - const int num, const int channels, const int height, - const int width, const int size, const float alpha_over_size, - const float k, float* const 
scale); -template void LRNFillScale(cl_kernel kernel, const int nthreads, const double* const in, - const int num, const int channels, const int height, - const int width, const int size, const double alpha_over_size, - const double k, double* const scale); - -template + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void opttrans(const float* data_im, const int im_offset, + const int channels, + const int height, const int width, float* data_opt, const int opt_offset, + const int optnum); +template void opttrans(const double* data_im, const int im_offset, + const int channels, + const int height, const int width, double* data_opt, const int opt_offset, + const int optnum); + +template +void LRNFillScale(cl_kernel LFSkernel, const int nthreads, + const Dtype* const in, + const int num, const int channels, const int height, + const int width, 
const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale) { + cl_int ret; + ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); + ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); + ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void LRNFillScale(cl_kernel kernel, const int nthreads, + const float* const in, + const int num, const int channels, const int height, + const int width, const int size, const float alpha_over_size, + const float k, float* const scale); +template void LRNFillScale(cl_kernel kernel, const int nthreads, + const double* const in, + const int num, const int channels, const int height, + const int width, const int size, const double alpha_over_size, + const double k, double* const scale); + +template void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out){ - cl_int ret; - ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); - ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); - ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale); - ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta); - 
ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[]={(size_t)nthreads}; - size_t uiLocal_Work_Size2[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); -} -template void LRNComputeOutput(cl_kernel kernel, int nthreads, const float* in, - float* scale, float negative_beta, float* out); -template void LRNComputeOutput(cl_kernel kernel, int nthreads, const double* in, - double* scale, double negative_beta, double* out); - -template + Dtype* scale, Dtype negative_beta, Dtype* out) { + cl_int ret; + ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} +template void LRNComputeOutput(cl_kernel kernel, int nthreads, + const float* in, + float* scale, float negative_beta, float* out); +template void LRNComputeOutput(cl_kernel kernel, int nthreads, + const double* in, + double* scale, double negative_beta, double* out); + +template void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff){ - cl_int ret; - ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads); - 
ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data); - ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data); - ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale); - ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff); - ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num); - ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta); - ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio); - ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={(size_t)nthreads}; - size_t uiLocal_Work_Size[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) ); + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff) { + cl_int ret; + ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) 
&width); + ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); + ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void LRNComputeDiff(cl_kernel kernel, const int nthreads, - const float* const bottom_data, const float* const top_data, - const float* const scale, const float* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const float negative_beta, - const float cache_ratio, float* const bottom_diff); + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const float negative_beta, + const float cache_ratio, float* const bottom_diff); template void LRNComputeDiff(cl_kernel kernel, const int nthreads, - const double* const bottom_data, const double* const top_data, - const double* const scale, const double* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const double negative_beta, - const double cache_ratio, double* const bottom_diff); - -template -void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ - std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), 
(void*)&in2); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add (const int n, const float* in1, const float* in2, float* y); -template void caffe_gpu_add (const int n, const double* in1, const double* in2, double* y); - -template -void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ){ - std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)N}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y ); -template void caffe_gpu_sign_ocl(const int N, const double* X, double* Y ); - -template -void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ){ - std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)N}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y ); -template void 
caffe_gpu_abs_ocl(const int N, const double* X, double* Y ); - -template -void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = "div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_div (const int n, const float* a, const float* b, float* y); -template void caffe_gpu_div (const int n, const double* a, const double* b, double* y); - -template -void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ - std::string kernel_name = "add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add_scalar (const int n, const float alpha, float* top_data); -template void caffe_gpu_add_scalar (const int n, const double alpha, double* top_data); - -template -void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = "element_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_mul (const int n, const float* a, const float* b, float* y); -template void caffe_gpu_mul (const int n, const double* a, const double* b, double* y); - -template -void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ - std::string kernel_name = "powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_powx (const int n, const float* a, const float alpha, float* y); -template void caffe_gpu_powx (const int n, const double* a, const double alpha, double* y); - -template -void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) -{ - std::string kernel_name = "DropoutForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); - ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); - 
ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); - ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_); - ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void DropoutForward(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); -template void DropoutForward(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); - -template -void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) -{ - std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); - ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); - ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_); - ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void DropoutBackward(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); -template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); - - -template -void BNLLForward(const int count, const 
Dtype* bottom_data, Dtype *top_data) -{ - std::string kernel_name = "BNLLForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void BNLLForward(const int count, const float* bottom_data, float *top_data); -template void BNLLForward(const int count, const double* bottom_data, double *top_data); - -template -void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff) -{ - std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(kernel,3,sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void BNLLBackward(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff); -template void BNLLBackward(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff); - - -template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int 
offset_concat_axis, Dtype *out_data) -{ - std::string kernel_name = "Concat" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&in_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*)&forward); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&num_concats); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&concat_size); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&top_concat_axis); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&bottom_concat_axis); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&offset_concat_axis); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&out_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)nthreads}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void Concat(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data); -template void Concat(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data); - -template + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const double negative_beta, + const double cache_ratio, double* const bottom_diff); + +template +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + std::string kernel_name = "caffe_gpu_add" + 
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add(const int n, const float* in1, + const float* in2, float* y); +template void caffe_gpu_add(const int n, const double* in1, + const double* in2, double* y); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_sign_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + 
size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_abs_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_div(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar(const int n, const float alpha, + float* 
top_data); +template void caffe_gpu_add_scalar(const int n, const double alpha, + double* top_data); + +template +void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_mul(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx(const int n, const float* a, + const float alpha, float* y); +template void caffe_gpu_powx(const int n, const double* a, + const double alpha, double* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, + const int* MaskMem, 
const Dtype scale_, Dtype* top_data) + { + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void DropoutForward(const int count, const float* bottom_data, + const int* MaskMem, const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, + const int* MaskMem, const double scale_, double* top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, + const float threshold_, const Dtype scale_, Dtype* bottom_diff) + { + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, 
NULL, NULL)); +} +template void DropoutBackward(const int count, const float* top_diff, + const int* MaskMem, const float threshold_, const float scale_, + float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, + const int* MaskMem, const float threshold_, const double scale_, + double* bottom_diff); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) + { + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLForward(const int count, const float* bottom_data, + float *top_data); +template void BNLLForward(const int count, const double* bottom_data, + double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff) + { + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} 
+template void BNLLBackward(const int count, const float* top_diff, + const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, + const double* bottom_data, double *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, Dtype *out_data) + { + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*) &forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Concat(const int nthreads, const float* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, float *out_data); +template void Concat(const int nthreads, const double* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, 
double *out_data); + +template void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) -{ - std::string kernel_name = "CLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&channels); - ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*)&margin); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*)&legacy_version); - ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&y); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&diff); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&dist_sq); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff) + { + std::string kernel_name = "CLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff); + ret |= 
clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - const float* y, const float* diff, const float* dist_sq, - float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + const float* y, const float* diff, const float* dist_sq, + float *bottom_diff); template void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - const double* y, const double* diff, const double* dist_sq, - double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + const double* y, const double* diff, const double* dist_sq, + double *bottom_diff); -template +template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) -{ - std::string kernel_name = "MaxForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&bottom_data_a); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&bottom_data_b); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&blob_idx); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&top_data); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)nthreads}; - size_t Local_Work_Size[] = {256}; - 
OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask) + { + std::string kernel_name = "MaxForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxForward(const int nthreads, const float* bottom_data_a, - const float* bottom_data_b, const int blob_idx, float* top_data, - int* mask); -template void MaxForward(const int nthreads, const double* bottom_data_a, - const double* bottom_data_b, const int blob_idx, double* top_data, - int* mask); - -template + const float* bottom_data_b, const int blob_idx, float* top_data, + int* mask); +template void MaxForward(const int nthreads, + const double* bottom_data_a, + const double* bottom_data_b, const int blob_idx, double* top_data, + int* mask); + +template void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) -{ - std::string kernel_name = "MaxBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&top_diff); - ret 
|= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&blob_idx); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)nthreads}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void MaxBackward(const int nthreads, const float* top_diff, const int blob_idx, const int* mask, float* bottom_diff); -template void MaxBackward(const int nthreads, const double* top_diff, const int blob_idx, const int* mask, double* bottom_diff); - - -template -void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) -{ -} -template void ocl_conv(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); -template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); - + const int blob_idx, const int* mask, Dtype* bottom_diff) + { + std::string kernel_name = "MaxBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff); + 
OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void MaxBackward(const int nthreads, const float* top_diff, + const int blob_idx, const int* mask, float* bottom_diff); +template void MaxBackward(const int nthreads, const double* top_diff, + const int blob_idx, const int* mask, double* bottom_diff); + +template +void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, + int channel_in, int width, int height, int channel_out, int width_out, + int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) + { +} +template void ocl_conv(float* bottom_data, float* top_data, + float* weights, float* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); +template void ocl_conv(double* bottom_data, double* top_data, + double* weights, double* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); } // namespace caffe diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index 38a06026..f4373901 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -13,540 +13,564 @@ namespace caffe { bool NetNeedsUpgrade(const NetParameter& net_param) { - return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param); + return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param); } bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param) { - for (int i = 0; i < net_param.layers_size(); ++i) { - if (net_param.layers(i).has_layer()) { - return true; - } - } - return false; + for (int i = 0; i < net_param.layers_size(); ++i) { + if 
(net_param.layers(i).has_layer()) { + return true; + } + } + return false; } bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) { - return net_param.layers_size() > 0; + return net_param.layers_size() > 0; } bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers, - NetParameter* net_param) { - // First upgrade padding layers to padded conv layers. - NetParameter v0_net_param; - UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param); - // Now upgrade layer parameters. - bool is_fully_compatible = true; - net_param->Clear(); - if (v0_net_param.has_name()) { - net_param->set_name(v0_net_param.name()); - } - for (int i = 0; i < v0_net_param.layers_size(); ++i) { - is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i), - net_param->add_layers()); - } - for (int i = 0; i < v0_net_param.input_size(); ++i) { - net_param->add_input(v0_net_param.input(i)); - } - for (int i = 0; i < v0_net_param.input_dim_size(); ++i) { - net_param->add_input_dim(v0_net_param.input_dim(i)); - } - if (v0_net_param.has_force_backward()) { - net_param->set_force_backward(v0_net_param.force_backward()); - } - return is_fully_compatible; + NetParameter* net_param) { + // First upgrade padding layers to padded conv layers. + NetParameter v0_net_param; + UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param); + // Now upgrade layer parameters. 
+ bool is_fully_compatible = true; + net_param->Clear(); + if (v0_net_param.has_name()) { + net_param->set_name(v0_net_param.name()); + } + for (int i = 0; i < v0_net_param.layers_size(); ++i) { + is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i), + net_param->add_layers()); + } + for (int i = 0; i < v0_net_param.input_size(); ++i) { + net_param->add_input(v0_net_param.input(i)); + } + for (int i = 0; i < v0_net_param.input_dim_size(); ++i) { + net_param->add_input_dim(v0_net_param.input_dim(i)); + } + if (v0_net_param.has_force_backward()) { + net_param->set_force_backward(v0_net_param.force_backward()); + } + return is_fully_compatible; } void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad) { - // Copy everything other than the layers from the original param. - param_upgraded_pad->Clear(); - param_upgraded_pad->CopyFrom(param); - param_upgraded_pad->clear_layers(); - // Figure out which layer each bottom blob comes from. - map blob_name_to_last_top_idx; - for (int i = 0; i < param.input_size(); ++i) { - const string& blob_name = param.input(i); - blob_name_to_last_top_idx[blob_name] = -1; - } - for (int i = 0; i < param.layers_size(); ++i) { - const V1LayerParameter& layer_connection = param.layers(i); - const V0LayerParameter& layer_param = layer_connection.layer(); - // Add the layer to the new net, unless it's a padding layer. 
- if (layer_param.type() != "padding") { - param_upgraded_pad->add_layers()->CopyFrom(layer_connection); - } - for (int j = 0; j < layer_connection.bottom_size(); ++j) { - const string& blob_name = layer_connection.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { - LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; - } - const int top_idx = blob_name_to_last_top_idx[blob_name]; - if (top_idx == -1) { - continue; - } - const V1LayerParameter& source_layer = param.layers(top_idx); - if (source_layer.layer().type() == "padding") { - // This layer has a padding layer as input -- check that it is a conv - // layer or a pooling layer and takes only one input. Also check that - // the padding layer input has only one input and one output. Other - // cases have undefined behavior in Caffe. - CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool")) - << "Padding layer input to " - "non-convolutional / non-pooling layer type " - << layer_param.type(); - CHECK_EQ(layer_connection.bottom_size(), 1) - << "Conv Layer takes a single blob as input."; - CHECK_EQ(source_layer.bottom_size(), 1) - << "Padding Layer takes a single blob as input."; - CHECK_EQ(source_layer.top_size(), 1) - << "Padding Layer produces a single blob as output."; - int layer_index = param_upgraded_pad->layers_size() - 1; - param_upgraded_pad->mutable_layers(layer_index)->mutable_layer() - ->set_pad(source_layer.layer().pad()); - param_upgraded_pad->mutable_layers(layer_index) - ->set_bottom(j, source_layer.bottom(0)); - } - } - for (int j = 0; j < layer_connection.top_size(); ++j) { - const string& blob_name = layer_connection.top(j); - blob_name_to_last_top_idx[blob_name] = i; - } - } + NetParameter* param_upgraded_pad) { + // Copy everything other than the layers from the original param. 
+ param_upgraded_pad->Clear(); + param_upgraded_pad->CopyFrom(param); + param_upgraded_pad->clear_layers(); + // Figure out which layer each bottom blob comes from. + map blob_name_to_last_top_idx; + for (int i = 0; i < param.input_size(); ++i) { + const string& blob_name = param.input(i); + blob_name_to_last_top_idx[blob_name] = -1; + } + for (int i = 0; i < param.layers_size(); ++i) { + const V1LayerParameter& layer_connection = param.layers(i); + const V0LayerParameter& layer_param = layer_connection.layer(); + // Add the layer to the new net, unless it's a padding layer. + if (layer_param.type() != "padding") { + param_upgraded_pad->add_layers()->CopyFrom(layer_connection); + } + for (int j = 0; j < layer_connection.bottom_size(); ++j) { + const string& blob_name = layer_connection.bottom(j); + if (blob_name_to_last_top_idx.find(blob_name) == + blob_name_to_last_top_idx.end()) { + LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; + } + const int top_idx = blob_name_to_last_top_idx[blob_name]; + if (top_idx == -1) { + continue; + } + const V1LayerParameter& source_layer = param.layers(top_idx); + if (source_layer.layer().type() == "padding") { + // This layer has a padding layer as input -- check that it is a conv + // layer or a pooling layer and takes only one input. Also check that + // the padding layer input has only one input and one output. Other + // cases have undefined behavior in Caffe. 
+ CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool")) + << "Padding layer input to " + "non-convolutional / non-pooling layer type " + << layer_param.type(); + CHECK_EQ(layer_connection.bottom_size(), 1) + << "Conv Layer takes a single blob as input."; + CHECK_EQ(source_layer.bottom_size(), 1) + << "Padding Layer takes a single blob as input."; + CHECK_EQ(source_layer.top_size(), 1) + << "Padding Layer produces a single blob as output."; + int layer_index = param_upgraded_pad->layers_size() - 1; + param_upgraded_pad->mutable_layers(layer_index)->mutable_layer() + ->set_pad(source_layer.layer().pad()); + param_upgraded_pad->mutable_layers(layer_index) + ->set_bottom(j, source_layer.bottom(0)); + } + } + for (int j = 0; j < layer_connection.top_size(); ++j) { + const string& blob_name = layer_connection.top(j); + blob_name_to_last_top_idx[blob_name] = i; + } + } } bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param) { - bool is_fully_compatible = true; - layer_param->Clear(); - for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) { - layer_param->add_bottom(v0_layer_connection.bottom(i)); - } - for (int i = 0; i < v0_layer_connection.top_size(); ++i) { - layer_param->add_top(v0_layer_connection.top(i)); - } - if (v0_layer_connection.has_layer()) { - const V0LayerParameter& v0_layer_param = v0_layer_connection.layer(); - if (v0_layer_param.has_name()) { - layer_param->set_name(v0_layer_param.name()); - } - const string& type = v0_layer_param.type(); - if (v0_layer_param.has_type()) { - layer_param->set_type(UpgradeV0LayerType(type)); - } - for (int i = 0; i < v0_layer_param.blobs_size(); ++i) { - layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i)); - } - for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) { - layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i)); - } - for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) { - 
layer_param->add_weight_decay(v0_layer_param.weight_decay(i)); - } - if (v0_layer_param.has_num_output()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_num_output( - v0_layer_param.num_output()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()->set_num_output( - v0_layer_param.num_output()); - } else { - LOG(ERROR) << "Unknown parameter num_output for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_biasterm()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_bias_term( - v0_layer_param.biasterm()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()->set_bias_term( - v0_layer_param.biasterm()); - } else { - LOG(ERROR) << "Unknown parameter biasterm for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_weight_filler()) { - if (type == "conv") { - layer_param->mutable_convolution_param()-> - mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()-> - mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); - } else { - LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_bias_filler()) { - if (type == "conv") { - layer_param->mutable_convolution_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); - } else { - LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_pad()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad()); - } else if (type == "pool") { - 
layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad()); - } else { - LOG(ERROR) << "Unknown parameter pad for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_kernelsize()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_kernel_size( - v0_layer_param.kernelsize()); - } else if (type == "pool") { - layer_param->mutable_pooling_param()->set_kernel_size( - v0_layer_param.kernelsize()); - } else { - LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_group()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_group( - v0_layer_param.group()); - } else { - LOG(ERROR) << "Unknown parameter group for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_stride()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_stride( - v0_layer_param.stride()); - } else if (type == "pool") { - layer_param->mutable_pooling_param()->set_stride( - v0_layer_param.stride()); - } else { - LOG(ERROR) << "Unknown parameter stride for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_pool()) { - if (type == "pool") { - V0LayerParameter_PoolMethod pool = v0_layer_param.pool(); - switch (pool) { - case V0LayerParameter_PoolMethod_MAX: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); - break; - case V0LayerParameter_PoolMethod_AVE: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - break; - case V0LayerParameter_PoolMethod_STOCHASTIC: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); - break; - default: - LOG(ERROR) << "Unknown pool method " << pool; - is_fully_compatible = false; - } - } else { - LOG(ERROR) << "Unknown parameter pool for layer type " << type; - is_fully_compatible = false; - } - } - if 
(v0_layer_param.has_dropout_ratio()) { - if (type == "dropout") { - layer_param->mutable_dropout_param()->set_dropout_ratio( - v0_layer_param.dropout_ratio()); - } else { - LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_local_size()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_local_size( - v0_layer_param.local_size()); - } else { - LOG(ERROR) << "Unknown parameter local_size for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_alpha()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha()); - } else { - LOG(ERROR) << "Unknown parameter alpha for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_beta()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta()); - } else { - LOG(ERROR) << "Unknown parameter beta for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_k()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_k(v0_layer_param.k()); - } else { - LOG(ERROR) << "Unknown parameter k for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_source()) { - if (type == "data") { - layer_param->mutable_data_param()->set_source(v0_layer_param.source()); - } else if (type == "hdf5_data") { - layer_param->mutable_hdf5_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "window_data") { - layer_param->mutable_window_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "infogain_loss") { - layer_param->mutable_infogain_loss_param()->set_source( - v0_layer_param.source()); - } else { - LOG(ERROR) << "Unknown parameter source for layer type " << type; - is_fully_compatible = 
false; - } - } - if (v0_layer_param.has_scale()) { - layer_param->mutable_transform_param()-> - set_scale(v0_layer_param.scale()); - } - if (v0_layer_param.has_meanfile()) { - layer_param->mutable_transform_param()-> - set_mean_file(v0_layer_param.meanfile()); - } - if (v0_layer_param.has_batchsize()) { - if (type == "data") { - layer_param->mutable_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "hdf5_data") { - layer_param->mutable_hdf5_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "window_data") { - layer_param->mutable_window_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else { - LOG(ERROR) << "Unknown parameter batchsize for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_cropsize()) { - layer_param->mutable_transform_param()-> - set_crop_size(v0_layer_param.cropsize()); - } - if (v0_layer_param.has_mirror()) { - layer_param->mutable_transform_param()-> - set_mirror(v0_layer_param.mirror()); - } - if (v0_layer_param.has_rand_skip()) { - if (type == "data") { - layer_param->mutable_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); - } else { - LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_shuffle_images()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_shuffle( - v0_layer_param.shuffle_images()); - } else { - LOG(ERROR) << "Unknown parameter shuffle for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_new_height()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_new_height( - v0_layer_param.new_height()); - } 
else { - LOG(ERROR) << "Unknown parameter new_height for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_new_width()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_new_width( - v0_layer_param.new_width()); - } else { - LOG(ERROR) << "Unknown parameter new_width for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_concat_dim()) { - if (type == "concat") { - layer_param->mutable_concat_param()->set_concat_dim( - v0_layer_param.concat_dim()); - } else { - LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_fg_threshold()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_fg_threshold( - v0_layer_param.det_fg_threshold()); - } else { - LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_bg_threshold()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_bg_threshold( - v0_layer_param.det_bg_threshold()); - } else { - LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_fg_fraction()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_fg_fraction( - v0_layer_param.det_fg_fraction()); - } else { - LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_context_pad()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_context_pad( - v0_layer_param.det_context_pad()); - } else { - LOG(ERROR) << "Unknown parameter det_context_pad for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_crop_mode()) { - if (type == "window_data") { - 
layer_param->mutable_window_data_param()->set_crop_mode( - v0_layer_param.det_crop_mode()); - } else { - LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_hdf5_output_param()) { - if (type == "hdf5_output") { - layer_param->mutable_hdf5_output_param()->CopyFrom( - v0_layer_param.hdf5_output_param()); - } else { - LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type " - << type; - is_fully_compatible = false; - } - } - } - return is_fully_compatible; + V1LayerParameter* layer_param) { + bool is_fully_compatible = true; + layer_param->Clear(); + for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) { + layer_param->add_bottom(v0_layer_connection.bottom(i)); + } + for (int i = 0; i < v0_layer_connection.top_size(); ++i) { + layer_param->add_top(v0_layer_connection.top(i)); + } + if (v0_layer_connection.has_layer()) { + const V0LayerParameter& v0_layer_param = v0_layer_connection.layer(); + if (v0_layer_param.has_name()) { + layer_param->set_name(v0_layer_param.name()); + } + const string& type = v0_layer_param.type(); + if (v0_layer_param.has_type()) { + layer_param->set_type(UpgradeV0LayerType(type)); + } + for (int i = 0; i < v0_layer_param.blobs_size(); ++i) { + layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i)); + } + for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) { + layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i)); + } + for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) { + layer_param->add_weight_decay(v0_layer_param.weight_decay(i)); + } + if (v0_layer_param.has_num_output()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_num_output( + v0_layer_param.num_output()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()->set_num_output( + v0_layer_param.num_output()); + } else { + LOG(ERROR) << "Unknown parameter num_output for layer type " << type; + 
is_fully_compatible = false; + } + } + if (v0_layer_param.has_biasterm()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_bias_term( + v0_layer_param.biasterm()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()->set_bias_term( + v0_layer_param.biasterm()); + } else { + LOG(ERROR) << "Unknown parameter biasterm for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_weight_filler()) { + if (type == "conv") { + layer_param->mutable_convolution_param()-> + mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); + } else { + LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_bias_filler()) { + if (type == "conv") { + layer_param->mutable_convolution_param()-> + mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); + } else { + LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_pad()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad()); + } else { + LOG(ERROR) << "Unknown parameter pad for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_kernelsize()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_kernel_size( + v0_layer_param.kernelsize()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_kernel_size( + v0_layer_param.kernelsize()); + } else { + 
LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_group()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_group( + v0_layer_param.group()); + } else { + LOG(ERROR) << "Unknown parameter group for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_stride()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_stride( + v0_layer_param.stride()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_stride( + v0_layer_param.stride()); + } else { + LOG(ERROR) << "Unknown parameter stride for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_pool()) { + if (type == "pool") { + V0LayerParameter_PoolMethod pool = v0_layer_param.pool(); + switch (pool) { + case V0LayerParameter_PoolMethod_MAX: + layer_param->mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_MAX); + break; + case V0LayerParameter_PoolMethod_AVE: + layer_param->mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + break; + case V0LayerParameter_PoolMethod_STOCHASTIC: + layer_param->mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_STOCHASTIC); + break; + default: + LOG(ERROR) << "Unknown pool method " << pool; + is_fully_compatible = false; + } + } else { + LOG(ERROR) << "Unknown parameter pool for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_dropout_ratio()) { + if (type == "dropout") { + layer_param->mutable_dropout_param()->set_dropout_ratio( + v0_layer_param.dropout_ratio()); + } else { + LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_local_size()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_local_size( + v0_layer_param.local_size()); + } else { + LOG(ERROR) << "Unknown parameter 
local_size for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_alpha()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha()); + } else { + LOG(ERROR) << "Unknown parameter alpha for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_beta()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta()); + } else { + LOG(ERROR) << "Unknown parameter beta for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_k()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_k(v0_layer_param.k()); + } else { + LOG(ERROR) << "Unknown parameter k for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_source()) { + if (type == "data") { + layer_param->mutable_data_param()->set_source(v0_layer_param.source()); + } else if (type == "hdf5_data") { + layer_param->mutable_hdf5_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "images") { + layer_param->mutable_image_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "window_data") { + layer_param->mutable_window_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "infogain_loss") { + layer_param->mutable_infogain_loss_param()->set_source( + v0_layer_param.source()); + } else { + LOG(ERROR) << "Unknown parameter source for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_scale()) { + layer_param->mutable_transform_param()-> + set_scale(v0_layer_param.scale()); + } + if (v0_layer_param.has_meanfile()) { + layer_param->mutable_transform_param()-> + set_mean_file(v0_layer_param.meanfile()); + } + if (v0_layer_param.has_batchsize()) { + if (type == "data") { + layer_param->mutable_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "hdf5_data") { + 
layer_param->mutable_hdf5_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "images") { + layer_param->mutable_image_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "window_data") { + layer_param->mutable_window_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else { + LOG(ERROR) << "Unknown parameter batchsize for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_cropsize()) { + layer_param->mutable_transform_param()-> + set_crop_size(v0_layer_param.cropsize()); + } + if (v0_layer_param.has_mirror()) { + layer_param->mutable_transform_param()-> + set_mirror(v0_layer_param.mirror()); + } + if (v0_layer_param.has_rand_skip()) { + if (type == "data") { + layer_param->mutable_data_param()->set_rand_skip( + v0_layer_param.rand_skip()); + } else if (type == "images") { + layer_param->mutable_image_data_param()->set_rand_skip( + v0_layer_param.rand_skip()); + } else { + LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_shuffle_images()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_shuffle( + v0_layer_param.shuffle_images()); + } else { + LOG(ERROR) << "Unknown parameter shuffle for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_new_height()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_new_height( + v0_layer_param.new_height()); + } else { + LOG(ERROR) << "Unknown parameter new_height for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_new_width()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_new_width( + v0_layer_param.new_width()); + } else { + LOG(ERROR) << "Unknown parameter new_width for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_concat_dim()) { + if (type == 
"concat") { + layer_param->mutable_concat_param()->set_concat_dim( + v0_layer_param.concat_dim()); + } else { + LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_fg_threshold()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_fg_threshold( + v0_layer_param.det_fg_threshold()); + } else { + LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_bg_threshold()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_bg_threshold( + v0_layer_param.det_bg_threshold()); + } else { + LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_fg_fraction()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_fg_fraction( + v0_layer_param.det_fg_fraction()); + } else { + LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_context_pad()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_context_pad( + v0_layer_param.det_context_pad()); + } else { + LOG(ERROR) << "Unknown parameter det_context_pad for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_crop_mode()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_crop_mode( + v0_layer_param.det_crop_mode()); + } else { + LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_hdf5_output_param()) { + if (type == "hdf5_output") { + layer_param->mutable_hdf5_output_param()->CopyFrom( + v0_layer_param.hdf5_output_param()); + } else { + LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type " + << 
type; + is_fully_compatible = false; + } + } + } + return is_fully_compatible; } V1LayerParameter_LayerType UpgradeV0LayerType(const string& type) { - if (type == "accuracy") { - return V1LayerParameter_LayerType_ACCURACY; - } else if (type == "bnll") { - return V1LayerParameter_LayerType_BNLL; - } else if (type == "concat") { - return V1LayerParameter_LayerType_CONCAT; - } else if (type == "conv") { - return V1LayerParameter_LayerType_CONVOLUTION; - } else if (type == "data") { - return V1LayerParameter_LayerType_DATA; - } else if (type == "dropout") { - return V1LayerParameter_LayerType_DROPOUT; - } else if (type == "euclidean_loss") { - return V1LayerParameter_LayerType_EUCLIDEAN_LOSS; - } else if (type == "flatten") { - return V1LayerParameter_LayerType_FLATTEN; - } else if (type == "hdf5_data") { - return V1LayerParameter_LayerType_HDF5_DATA; - } else if (type == "hdf5_output") { - return V1LayerParameter_LayerType_HDF5_OUTPUT; - } else if (type == "im2col") { - return V1LayerParameter_LayerType_IM2COL; - } else if (type == "images") { - return V1LayerParameter_LayerType_IMAGE_DATA; - } else if (type == "infogain_loss") { - return V1LayerParameter_LayerType_INFOGAIN_LOSS; - } else if (type == "innerproduct") { - return V1LayerParameter_LayerType_INNER_PRODUCT; - } else if (type == "lrn") { - return V1LayerParameter_LayerType_LRN; - } else if (type == "multinomial_logistic_loss") { - return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; - } else if (type == "pool") { - return V1LayerParameter_LayerType_POOLING; - } else if (type == "relu") { - return V1LayerParameter_LayerType_RELU; - } else if (type == "sigmoid") { - return V1LayerParameter_LayerType_SIGMOID; - } else if (type == "softmax") { - return V1LayerParameter_LayerType_SOFTMAX; - } else if (type == "softmax_loss") { - return V1LayerParameter_LayerType_SOFTMAX_LOSS; - } else if (type == "split") { - return V1LayerParameter_LayerType_SPLIT; - } else if (type == "tanh") { - return 
V1LayerParameter_LayerType_TANH; - } else if (type == "window_data") { - return V1LayerParameter_LayerType_WINDOW_DATA; - } else { - LOG(FATAL) << "Unknown layer name: " << type; - return V1LayerParameter_LayerType_NONE; - } + if (type == "accuracy") { + return V1LayerParameter_LayerType_ACCURACY; + } else if (type == "bnll") { + return V1LayerParameter_LayerType_BNLL; + } else if (type == "concat") { + return V1LayerParameter_LayerType_CONCAT; + } else if (type == "conv") { + return V1LayerParameter_LayerType_CONVOLUTION; + } else if (type == "data") { + return V1LayerParameter_LayerType_DATA; + } else if (type == "dropout") { + return V1LayerParameter_LayerType_DROPOUT; + } else if (type == "euclidean_loss") { + return V1LayerParameter_LayerType_EUCLIDEAN_LOSS; + } else if (type == "flatten") { + return V1LayerParameter_LayerType_FLATTEN; + } else if (type == "hdf5_data") { + return V1LayerParameter_LayerType_HDF5_DATA; + } else if (type == "hdf5_output") { + return V1LayerParameter_LayerType_HDF5_OUTPUT; + } else if (type == "im2col") { + return V1LayerParameter_LayerType_IM2COL; + } else if (type == "images") { + return V1LayerParameter_LayerType_IMAGE_DATA; + } else if (type == "infogain_loss") { + return V1LayerParameter_LayerType_INFOGAIN_LOSS; + } else if (type == "innerproduct") { + return V1LayerParameter_LayerType_INNER_PRODUCT; + } else if (type == "lrn") { + return V1LayerParameter_LayerType_LRN; + } else if (type == "multinomial_logistic_loss") { + return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; + } else if (type == "pool") { + return V1LayerParameter_LayerType_POOLING; + } else if (type == "relu") { + return V1LayerParameter_LayerType_RELU; + } else if (type == "sigmoid") { + return V1LayerParameter_LayerType_SIGMOID; + } else if (type == "softmax") { + return V1LayerParameter_LayerType_SOFTMAX; + } else if (type == "softmax_loss") { + return V1LayerParameter_LayerType_SOFTMAX_LOSS; + } else if (type == "split") { + return 
V1LayerParameter_LayerType_SPLIT; + } else if (type == "tanh") { + return V1LayerParameter_LayerType_TANH; + } else if (type == "window_data") { + return V1LayerParameter_LayerType_WINDOW_DATA; + } else { + LOG(FATAL) << "Unknown layer name: " << type; + return V1LayerParameter_LayerType_NONE; + } } bool NetNeedsDataUpgrade(const NetParameter& net_param) { - for (int i = 0; i < net_param.layers_size(); ++i) { - if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) { - DataParameter layer_param = net_param.layers(i).data_param(); - if (layer_param.has_scale()) { return true; } - if (layer_param.has_mean_file()) { return true; } - if (layer_param.has_crop_size()) { return true; } - if (layer_param.has_mirror()) { return true; } - } - if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) { - ImageDataParameter layer_param = net_param.layers(i).image_data_param(); - if (layer_param.has_scale()) { return true; } - if (layer_param.has_mean_file()) { return true; } - if (layer_param.has_crop_size()) { return true; } - if (layer_param.has_mirror()) { return true; } - } - if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) { - WindowDataParameter layer_param = net_param.layers(i).window_data_param(); - if (layer_param.has_scale()) { return true; } - if (layer_param.has_mean_file()) { return true; } - if (layer_param.has_crop_size()) { return true; } - if (layer_param.has_mirror()) { return true; } - } - } - return false; + for (int i = 0; i < net_param.layers_size(); ++i) { + if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) { + DataParameter layer_param = net_param.layers(i).data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) { + ImageDataParameter 
layer_param = net_param.layers(i).image_data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) { + WindowDataParameter layer_param = net_param.layers(i).window_data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + } + return false; } #define CONVERT_LAYER_TRANSFORM_PARAM(TYPE, Name, param_name) \ @@ -576,365 +600,373 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) { } while (0) void UpgradeNetDataTransformation(NetParameter* net_param) { - for (int i = 0; i < net_param->layers_size(); ++i) { - CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data); - CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data); - CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data); - } + for (int i = 0; i < net_param->layers_size(); ++i) { + CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data); + CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data); + CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data); + } } bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { - bool success = true; - if (NetNeedsV0ToV1Upgrade(*param)) { - // NetParameter was specified using the old style (V0LayerParameter); try to - // upgrade it. 
- LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V0LayerParameter: " << param_file; - NetParameter original_param(*param); - if (!UpgradeV0Net(original_param, param)) { - success = false; - LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V0NetParameter to NetParameter (see above); continuing anyway."; - } else { - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V0LayerParameter"; - } - LOG(ERROR) << "Note that future Caffe releases will not support " - << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " - << "prototxt and ./build/tools/upgrade_net_proto_binary for model " - << "weights upgrade this and any other net protos to the new format."; - } - // NetParameter uses old style data transformation fields; try to upgrade it. - if (NetNeedsDataUpgrade(*param)) { - LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "transformation parameters: " << param_file; - UpgradeNetDataTransformation(param); - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "data transformation parameters."; - LOG(ERROR) << "Note that future Caffe releases will only support " - << "transform_param messages for transformation fields."; - } - if (NetNeedsV1ToV2Upgrade(*param)) { - LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V1LayerParameter: " << param_file; - NetParameter original_param(*param); - if (!UpgradeV1Net(original_param, param)) { - success = false; - LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V1LayerParameter (see above); continuing anyway."; - } else { - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V1LayerParameter"; - } - } - return success; + bool success = true; + if (NetNeedsV0ToV1Upgrade(*param)) { + // NetParameter was specified using the old style (V0LayerParameter); try to + // upgrade it. 
+ LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "V0LayerParameter: " << param_file; + NetParameter original_param(*param); + if (!UpgradeV0Net(original_param, param)) { + success = false; + LOG(ERROR) << "Warning: had one or more problems upgrading " + << "V0NetParameter to NetParameter (see above); continuing anyway."; + } else { + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "V0LayerParameter"; + } + LOG(ERROR) << "Note that future Caffe releases will not support " + << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " + << "prototxt and ./build/tools/upgrade_net_proto_binary for model " + << "weights upgrade this and any other net protos to the new format."; + } + // NetParameter uses old style data transformation fields; try to upgrade it. + if (NetNeedsDataUpgrade(*param)) { + LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "transformation parameters: " << param_file; + UpgradeNetDataTransformation(param); + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "data transformation parameters."; + LOG(ERROR) << "Note that future Caffe releases will only support " + << "transform_param messages for transformation fields."; + } + if (NetNeedsV1ToV2Upgrade(*param)) { + LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "V1LayerParameter: " << param_file; + NetParameter original_param(*param); + if (!UpgradeV1Net(original_param, param)) { + success = false; + LOG(ERROR) << "Warning: had one or more problems upgrading " + << "V1LayerParameter (see above); continuing anyway."; + } else { + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "V1LayerParameter"; + } + } + return success; } bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) { - bool is_fully_compatible = true; - if (v1_net_param.layer_size() > 0) { - LOG(ERROR) << "Input NetParameter to be 
upgraded already specifies 'layer' " - << "fields; these will be ignored for the upgrade."; - is_fully_compatible = false; - } - net_param->CopyFrom(v1_net_param); - net_param->clear_layers(); - net_param->clear_layer(); - for (int i = 0; i < v1_net_param.layers_size(); ++i) { - if (!UpgradeV1LayerParameter(v1_net_param.layers(i), - net_param->add_layer())) { - LOG(ERROR) << "Upgrade of input layer " << i << " failed."; - is_fully_compatible = false; - } - } - return is_fully_compatible; + bool is_fully_compatible = true; + if (v1_net_param.layer_size() > 0) { + LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' " + << "fields; these will be ignored for the upgrade."; + is_fully_compatible = false; + } + net_param->CopyFrom(v1_net_param); + net_param->clear_layers(); + net_param->clear_layer(); + for (int i = 0; i < v1_net_param.layers_size(); ++i) { + if (!UpgradeV1LayerParameter(v1_net_param.layers(i), + net_param->add_layer())) { + LOG(ERROR) << "Upgrade of input layer " << i << " failed."; + is_fully_compatible = false; + } + } + return is_fully_compatible; } bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param) { - layer_param->Clear(); - bool is_fully_compatible = true; - for (int i = 0; i < v1_layer_param.bottom_size(); ++i) { - layer_param->add_bottom(v1_layer_param.bottom(i)); - } - for (int i = 0; i < v1_layer_param.top_size(); ++i) { - layer_param->add_top(v1_layer_param.top(i)); - } - if (v1_layer_param.has_name()) { - layer_param->set_name(v1_layer_param.name()); - } - for (int i = 0; i < v1_layer_param.include_size(); ++i) { - layer_param->add_include()->CopyFrom(v1_layer_param.include(i)); - } - for (int i = 0; i < v1_layer_param.exclude_size(); ++i) { - layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i)); - } - if (v1_layer_param.has_type()) { - layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type())); - } - for (int i = 0; i < v1_layer_param.blobs_size(); ++i) 
{ - layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i)); - } - for (int i = 0; i < v1_layer_param.param_size(); ++i) { - while (layer_param->param_size() <= i) { layer_param->add_param(); } - layer_param->mutable_param(i)->set_name(v1_layer_param.param(i)); - } - ParamSpec_DimCheckMode mode; - for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) { - while (layer_param->param_size() <= i) { layer_param->add_param(); } - switch (v1_layer_param.blob_share_mode(i)) { - case V1LayerParameter_DimCheckMode_STRICT: - mode = ParamSpec_DimCheckMode_STRICT; - break; - case V1LayerParameter_DimCheckMode_PERMISSIVE: - mode = ParamSpec_DimCheckMode_PERMISSIVE; - break; - default: - LOG(FATAL) << "Unknown blob_share_mode: " - << v1_layer_param.blob_share_mode(i); - break; - } - layer_param->mutable_param(i)->set_share_mode(mode); - } - for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) { - while (layer_param->param_size() <= i) { layer_param->add_param(); } - layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i)); - } - for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) { - while (layer_param->param_size() <= i) { layer_param->add_param(); } - layer_param->mutable_param(i)->set_decay_mult( - v1_layer_param.weight_decay(i)); - } - for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) { - layer_param->add_loss_weight(v1_layer_param.loss_weight(i)); - } - if (v1_layer_param.has_accuracy_param()) { - layer_param->mutable_accuracy_param()->CopyFrom( - v1_layer_param.accuracy_param()); - } - if (v1_layer_param.has_argmax_param()) { - layer_param->mutable_argmax_param()->CopyFrom( - v1_layer_param.argmax_param()); - } - if (v1_layer_param.has_concat_param()) { - layer_param->mutable_concat_param()->CopyFrom( - v1_layer_param.concat_param()); - } - if (v1_layer_param.has_contrastive_loss_param()) { - layer_param->mutable_contrastive_loss_param()->CopyFrom( - v1_layer_param.contrastive_loss_param()); - } - if 
(v1_layer_param.has_convolution_param()) { - layer_param->mutable_convolution_param()->CopyFrom( - v1_layer_param.convolution_param()); - } - if (v1_layer_param.has_data_param()) { - layer_param->mutable_data_param()->CopyFrom( - v1_layer_param.data_param()); - } - if (v1_layer_param.has_dropout_param()) { - layer_param->mutable_dropout_param()->CopyFrom( - v1_layer_param.dropout_param()); - } - if (v1_layer_param.has_dummy_data_param()) { - layer_param->mutable_dummy_data_param()->CopyFrom( - v1_layer_param.dummy_data_param()); - } - if (v1_layer_param.has_eltwise_param()) { - layer_param->mutable_eltwise_param()->CopyFrom( - v1_layer_param.eltwise_param()); - } - if (v1_layer_param.has_exp_param()) { - layer_param->mutable_exp_param()->CopyFrom( - v1_layer_param.exp_param()); - } - if (v1_layer_param.has_hdf5_data_param()) { - layer_param->mutable_hdf5_data_param()->CopyFrom( - v1_layer_param.hdf5_data_param()); - } - if (v1_layer_param.has_hdf5_output_param()) { - layer_param->mutable_hdf5_output_param()->CopyFrom( - v1_layer_param.hdf5_output_param()); - } - if (v1_layer_param.has_hinge_loss_param()) { - layer_param->mutable_hinge_loss_param()->CopyFrom( - v1_layer_param.hinge_loss_param()); - } - if (v1_layer_param.has_image_data_param()) { - layer_param->mutable_image_data_param()->CopyFrom( - v1_layer_param.image_data_param()); - } - if (v1_layer_param.has_infogain_loss_param()) { - layer_param->mutable_infogain_loss_param()->CopyFrom( - v1_layer_param.infogain_loss_param()); - } - if (v1_layer_param.has_inner_product_param()) { - layer_param->mutable_inner_product_param()->CopyFrom( - v1_layer_param.inner_product_param()); - } - if (v1_layer_param.has_lrn_param()) { - layer_param->mutable_lrn_param()->CopyFrom( - v1_layer_param.lrn_param()); - } - if (v1_layer_param.has_memory_data_param()) { - layer_param->mutable_memory_data_param()->CopyFrom( - v1_layer_param.memory_data_param()); - } - if (v1_layer_param.has_mvn_param()) { - 
layer_param->mutable_mvn_param()->CopyFrom( - v1_layer_param.mvn_param()); - } - if (v1_layer_param.has_pooling_param()) { - layer_param->mutable_pooling_param()->CopyFrom( - v1_layer_param.pooling_param()); - } - if (v1_layer_param.has_power_param()) { - layer_param->mutable_power_param()->CopyFrom( - v1_layer_param.power_param()); - } - if (v1_layer_param.has_relu_param()) { - layer_param->mutable_relu_param()->CopyFrom( - v1_layer_param.relu_param()); - } - if (v1_layer_param.has_sigmoid_param()) { - layer_param->mutable_sigmoid_param()->CopyFrom( - v1_layer_param.sigmoid_param()); - } - if (v1_layer_param.has_softmax_param()) { - layer_param->mutable_softmax_param()->CopyFrom( - v1_layer_param.softmax_param()); - } - if (v1_layer_param.has_slice_param()) { - layer_param->mutable_slice_param()->CopyFrom( - v1_layer_param.slice_param()); - } - if (v1_layer_param.has_tanh_param()) { - layer_param->mutable_tanh_param()->CopyFrom( - v1_layer_param.tanh_param()); - } - if (v1_layer_param.has_threshold_param()) { - layer_param->mutable_threshold_param()->CopyFrom( - v1_layer_param.threshold_param()); - } - if (v1_layer_param.has_window_data_param()) { - layer_param->mutable_window_data_param()->CopyFrom( - v1_layer_param.window_data_param()); - } - if (v1_layer_param.has_transform_param()) { - layer_param->mutable_transform_param()->CopyFrom( - v1_layer_param.transform_param()); - } - if (v1_layer_param.has_loss_param()) { - layer_param->mutable_loss_param()->CopyFrom( - v1_layer_param.loss_param()); - } - if (v1_layer_param.has_layer()) { - LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring."; - is_fully_compatible = false; - } - return is_fully_compatible; + LayerParameter* layer_param) { + layer_param->Clear(); + bool is_fully_compatible = true; + for (int i = 0; i < v1_layer_param.bottom_size(); ++i) { + layer_param->add_bottom(v1_layer_param.bottom(i)); + } + for (int i = 0; i < v1_layer_param.top_size(); ++i) { + 
layer_param->add_top(v1_layer_param.top(i)); + } + if (v1_layer_param.has_name()) { + layer_param->set_name(v1_layer_param.name()); + } + for (int i = 0; i < v1_layer_param.include_size(); ++i) { + layer_param->add_include()->CopyFrom(v1_layer_param.include(i)); + } + for (int i = 0; i < v1_layer_param.exclude_size(); ++i) { + layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i)); + } + if (v1_layer_param.has_type()) { + layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type())); + } + for (int i = 0; i < v1_layer_param.blobs_size(); ++i) { + layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i)); + } + for (int i = 0; i < v1_layer_param.param_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_name(v1_layer_param.param(i)); + } + ParamSpec_DimCheckMode mode; + for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + switch (v1_layer_param.blob_share_mode(i)) { + case V1LayerParameter_DimCheckMode_STRICT: + mode = ParamSpec_DimCheckMode_STRICT; + break; + case V1LayerParameter_DimCheckMode_PERMISSIVE: + mode = ParamSpec_DimCheckMode_PERMISSIVE; + break; + default: + LOG(FATAL) << "Unknown blob_share_mode: " + << v1_layer_param.blob_share_mode(i); + break; + } + layer_param->mutable_param(i)->set_share_mode(mode); + } + for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i)); + } + for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_decay_mult( + v1_layer_param.weight_decay(i)); + } + for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) { + layer_param->add_loss_weight(v1_layer_param.loss_weight(i)); + } + if 
(v1_layer_param.has_accuracy_param()) { + layer_param->mutable_accuracy_param()->CopyFrom( + v1_layer_param.accuracy_param()); + } + if (v1_layer_param.has_argmax_param()) { + layer_param->mutable_argmax_param()->CopyFrom( + v1_layer_param.argmax_param()); + } + if (v1_layer_param.has_concat_param()) { + layer_param->mutable_concat_param()->CopyFrom( + v1_layer_param.concat_param()); + } + if (v1_layer_param.has_contrastive_loss_param()) { + layer_param->mutable_contrastive_loss_param()->CopyFrom( + v1_layer_param.contrastive_loss_param()); + } + if (v1_layer_param.has_convolution_param()) { + layer_param->mutable_convolution_param()->CopyFrom( + v1_layer_param.convolution_param()); + } + if (v1_layer_param.has_data_param()) { + layer_param->mutable_data_param()->CopyFrom( + v1_layer_param.data_param()); + } + if (v1_layer_param.has_dropout_param()) { + layer_param->mutable_dropout_param()->CopyFrom( + v1_layer_param.dropout_param()); + } + if (v1_layer_param.has_dummy_data_param()) { + layer_param->mutable_dummy_data_param()->CopyFrom( + v1_layer_param.dummy_data_param()); + } + if (v1_layer_param.has_eltwise_param()) { + layer_param->mutable_eltwise_param()->CopyFrom( + v1_layer_param.eltwise_param()); + } + if (v1_layer_param.has_exp_param()) { + layer_param->mutable_exp_param()->CopyFrom( + v1_layer_param.exp_param()); + } + if (v1_layer_param.has_hdf5_data_param()) { + layer_param->mutable_hdf5_data_param()->CopyFrom( + v1_layer_param.hdf5_data_param()); + } + if (v1_layer_param.has_hdf5_output_param()) { + layer_param->mutable_hdf5_output_param()->CopyFrom( + v1_layer_param.hdf5_output_param()); + } + if (v1_layer_param.has_hinge_loss_param()) { + layer_param->mutable_hinge_loss_param()->CopyFrom( + v1_layer_param.hinge_loss_param()); + } + if (v1_layer_param.has_image_data_param()) { + layer_param->mutable_image_data_param()->CopyFrom( + v1_layer_param.image_data_param()); + } + if (v1_layer_param.has_infogain_loss_param()) { + 
layer_param->mutable_infogain_loss_param()->CopyFrom( + v1_layer_param.infogain_loss_param()); + } + if (v1_layer_param.has_inner_product_param()) { + layer_param->mutable_inner_product_param()->CopyFrom( + v1_layer_param.inner_product_param()); + } + if (v1_layer_param.has_lrn_param()) { + layer_param->mutable_lrn_param()->CopyFrom( + v1_layer_param.lrn_param()); + } + if (v1_layer_param.has_memory_data_param()) { + layer_param->mutable_memory_data_param()->CopyFrom( + v1_layer_param.memory_data_param()); + } + if (v1_layer_param.has_mvn_param()) { + layer_param->mutable_mvn_param()->CopyFrom( + v1_layer_param.mvn_param()); + } + if (v1_layer_param.has_pooling_param()) { + layer_param->mutable_pooling_param()->CopyFrom( + v1_layer_param.pooling_param()); + } + if (v1_layer_param.has_power_param()) { + layer_param->mutable_power_param()->CopyFrom( + v1_layer_param.power_param()); + } + if (v1_layer_param.has_relu_param()) { + layer_param->mutable_relu_param()->CopyFrom( + v1_layer_param.relu_param()); + } + if (v1_layer_param.has_sigmoid_param()) { + layer_param->mutable_sigmoid_param()->CopyFrom( + v1_layer_param.sigmoid_param()); + } + if (v1_layer_param.has_softmax_param()) { + layer_param->mutable_softmax_param()->CopyFrom( + v1_layer_param.softmax_param()); + } + if (v1_layer_param.has_slice_param()) { + layer_param->mutable_slice_param()->CopyFrom( + v1_layer_param.slice_param()); + } + if (v1_layer_param.has_tanh_param()) { + layer_param->mutable_tanh_param()->CopyFrom( + v1_layer_param.tanh_param()); + } + if (v1_layer_param.has_threshold_param()) { + layer_param->mutable_threshold_param()->CopyFrom( + v1_layer_param.threshold_param()); + } + if (v1_layer_param.has_window_data_param()) { + layer_param->mutable_window_data_param()->CopyFrom( + v1_layer_param.window_data_param()); + } + if (v1_layer_param.has_transform_param()) { + layer_param->mutable_transform_param()->CopyFrom( + v1_layer_param.transform_param()); + } + if (v1_layer_param.has_loss_param()) 
{ + layer_param->mutable_loss_param()->CopyFrom( + v1_layer_param.loss_param()); + } + if (v1_layer_param.has_layer()) { + LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring."; + is_fully_compatible = false; + } + return is_fully_compatible; } const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) { - switch (type) { - case V1LayerParameter_LayerType_NONE: - return ""; - case V1LayerParameter_LayerType_ABSVAL: - return "AbsVal"; - case V1LayerParameter_LayerType_ACCURACY: - return "Accuracy"; - case V1LayerParameter_LayerType_ARGMAX: - return "ArgMax"; - case V1LayerParameter_LayerType_BNLL: - return "BNLL"; - case V1LayerParameter_LayerType_CONCAT: - return "Concat"; - case V1LayerParameter_LayerType_CONTRASTIVE_LOSS: - return "ContrastiveLoss"; - case V1LayerParameter_LayerType_CONVOLUTION: - return "Convolution"; - case V1LayerParameter_LayerType_DECONVOLUTION: - return "Deconvolution"; - case V1LayerParameter_LayerType_DATA: - return "Data"; - case V1LayerParameter_LayerType_DROPOUT: - return "Dropout"; - case V1LayerParameter_LayerType_DUMMY_DATA: - return "DummyData"; - case V1LayerParameter_LayerType_EUCLIDEAN_LOSS: - return "EuclideanLoss"; - case V1LayerParameter_LayerType_ELTWISE: - return "Eltwise"; - case V1LayerParameter_LayerType_EXP: - return "Exp"; - case V1LayerParameter_LayerType_FLATTEN: - return "Flatten"; - case V1LayerParameter_LayerType_HDF5_DATA: - return "HDF5Data"; - case V1LayerParameter_LayerType_HDF5_OUTPUT: - return "HDF5Output"; - case V1LayerParameter_LayerType_HINGE_LOSS: - return "HingeLoss"; - case V1LayerParameter_LayerType_IM2COL: - return "Im2col"; - case V1LayerParameter_LayerType_IMAGE_DATA: - return "ImageData"; - case V1LayerParameter_LayerType_INFOGAIN_LOSS: - return "InfogainLoss"; - case V1LayerParameter_LayerType_INNER_PRODUCT: - return "InnerProduct"; - case V1LayerParameter_LayerType_LRN: - return "LRN"; - case V1LayerParameter_LayerType_MEMORY_DATA: - return "MemoryData"; - case 
V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS: - return "MultinomialLogisticLoss"; - case V1LayerParameter_LayerType_MVN: - return "MVN"; - case V1LayerParameter_LayerType_POOLING: - return "Pooling"; - case V1LayerParameter_LayerType_POWER: - return "Power"; - case V1LayerParameter_LayerType_RELU: - return "ReLU"; - case V1LayerParameter_LayerType_SIGMOID: - return "Sigmoid"; - case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS: - return "SigmoidCrossEntropyLoss"; - case V1LayerParameter_LayerType_SILENCE: - return "Silence"; - case V1LayerParameter_LayerType_SOFTMAX: - return "Softmax"; - case V1LayerParameter_LayerType_SOFTMAX_LOSS: - return "SoftmaxWithLoss"; - case V1LayerParameter_LayerType_SPLIT: - return "Split"; - case V1LayerParameter_LayerType_SLICE: - return "Slice"; - case V1LayerParameter_LayerType_TANH: - return "TanH"; - case V1LayerParameter_LayerType_WINDOW_DATA: - return "WindowData"; - case V1LayerParameter_LayerType_THRESHOLD: - return "Threshold"; - default: - LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type; - return ""; - } + switch (type) { + case V1LayerParameter_LayerType_NONE: + return ""; + case V1LayerParameter_LayerType_ABSVAL: + return "AbsVal"; + case V1LayerParameter_LayerType_ACCURACY: + return "Accuracy"; + case V1LayerParameter_LayerType_ARGMAX: + return "ArgMax"; + case V1LayerParameter_LayerType_BNLL: + return "BNLL"; + case V1LayerParameter_LayerType_CONCAT: + return "Concat"; + case V1LayerParameter_LayerType_CONTRASTIVE_LOSS: + return "ContrastiveLoss"; + case V1LayerParameter_LayerType_CONVOLUTION: + return "Convolution"; + case V1LayerParameter_LayerType_DECONVOLUTION: + return "Deconvolution"; + case V1LayerParameter_LayerType_DATA: + return "Data"; + case V1LayerParameter_LayerType_DROPOUT: + return "Dropout"; + case V1LayerParameter_LayerType_DUMMY_DATA: + return "DummyData"; + case V1LayerParameter_LayerType_EUCLIDEAN_LOSS: + return "EuclideanLoss"; + case 
V1LayerParameter_LayerType_ELTWISE: + return "Eltwise"; + case V1LayerParameter_LayerType_EXP: + return "Exp"; + case V1LayerParameter_LayerType_FLATTEN: + return "Flatten"; + case V1LayerParameter_LayerType_HDF5_DATA: + return "HDF5Data"; + case V1LayerParameter_LayerType_HDF5_OUTPUT: + return "HDF5Output"; + case V1LayerParameter_LayerType_HINGE_LOSS: + return "HingeLoss"; + case V1LayerParameter_LayerType_IM2COL: + return "Im2col"; + case V1LayerParameter_LayerType_IMAGE_DATA: + return "ImageData"; + case V1LayerParameter_LayerType_INFOGAIN_LOSS: + return "InfogainLoss"; + case V1LayerParameter_LayerType_INNER_PRODUCT: + return "InnerProduct"; + case V1LayerParameter_LayerType_LRN: + return "LRN"; + case V1LayerParameter_LayerType_MEMORY_DATA: + return "MemoryData"; + case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS: + return "MultinomialLogisticLoss"; + case V1LayerParameter_LayerType_MVN: + return "MVN"; + case V1LayerParameter_LayerType_POOLING: + return "Pooling"; + case V1LayerParameter_LayerType_POWER: + return "Power"; + case V1LayerParameter_LayerType_RELU: + return "ReLU"; + case V1LayerParameter_LayerType_SIGMOID: + return "Sigmoid"; + case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS: + return "SigmoidCrossEntropyLoss"; + case V1LayerParameter_LayerType_SILENCE: + return "Silence"; + case V1LayerParameter_LayerType_SOFTMAX: + return "Softmax"; + case V1LayerParameter_LayerType_SOFTMAX_LOSS: + return "SoftmaxWithLoss"; + case V1LayerParameter_LayerType_SPLIT: + return "Split"; + case V1LayerParameter_LayerType_SLICE: + return "Slice"; + case V1LayerParameter_LayerType_TANH: + return "TanH"; + case V1LayerParameter_LayerType_WINDOW_DATA: + return "WindowData"; + case V1LayerParameter_LayerType_THRESHOLD: + return "Threshold"; + default: + LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type; + return ""; + } } void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param) { - 
CHECK(ReadProtoFromTextFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; - UpgradeNetAsNeeded(param_file, param); + NetParameter* param) { + CHECK(ReadProtoFromTextFile(param_file, param)) + << "Failed to parse NetParameter file: " << param_file; + UpgradeNetAsNeeded(param_file, param); } void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param) { - CHECK(ReadProtoFromBinaryFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; - UpgradeNetAsNeeded(param_file, param); + NetParameter* param) { + CHECK(ReadProtoFromBinaryFile(param_file, param)) + << "Failed to parse NetParameter file: " << param_file; + UpgradeNetAsNeeded(param_file, param); } } // namespace caffe From f931d4cac3d78a859518b77abb0e598e5a0bfe0e Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 9 Sep 2015 18:38:06 +0800 Subject: [PATCH 070/124] Pass concat_layer & spp_layer; remove kernels in lrn_layer --- include/caffe/util/ocl_wrapper.hpp | 72 ++- include/caffe/vision_layers.hpp | 144 +++--- src/caffe/layers/lrn_layer.cpp | 90 ++-- src/caffe/layers/softmax_layer.cpp | 2 +- src/caffe/ocl/concat_layer.cl | 48 +- src/caffe/ocl/lrn_layer.cl | 12 +- src/caffe/ocl/pooling_layer.cl | 24 +- src/caffe/util/ocl_wrapper.cpp | 785 +++++++++++++---------------- 8 files changed, 547 insertions(+), 630 deletions(-) diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index c4149789..dbd712ea 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -286,44 +286,40 @@ void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y); template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); -template -void LRNFillScale(cl_kernel LFSkernel, const int nthreads, - const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* 
const scale); - -template -void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out); - -template -void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff); -template -void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y); - -template -void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y); - -template -void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); - -template -void BNLLBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype *bottom_diff); - -template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, - const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype *out_data); - -template +template +void LRNFillScale(const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale); + +template +void LRNComputeOutput(int nthreads, const Dtype* in, + Dtype* scale, Dtype negative_beta, Dtype* out); + +template +void LRNComputeDiff(const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff); +template +void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y); + +template +void caffe_gpu_mul (const int n, const Dtype* 
a, const Dtype* b, Dtype* y); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data); + +template void CLLBackward(const int count, const int channels, const Dtype margin, const bool legacy_version, const Dtype alpha, const Dtype* y, const Dtype* diff, const Dtype* dist_sq, diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 9b718bd8..eb959190 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -384,83 +384,75 @@ template class SplitLayer; * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template -class LRNLayer: public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "LRN"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& 
top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; +template +class LRNLayer : public Layer { + public: + explicit LRNLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "LRN"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const 
vector*>& bottom); + virtual void WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; - // Fields used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; - - cl_kernel LFSkernel, LCDkernel, LCOkernel; }; /*n diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 0f936f22..58f835b6 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -70,29 +70,26 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - switch 
(this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - top[0]->Reshape(num_, channels_, height_, width_); - scale_.Reshape(num_, channels_, height_, width_); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - split_layer_->Reshape(bottom, split_top_vec_); - square_layer_->Reshape(square_bottom_vec_, square_top_vec_); - pool_layer_->Reshape(square_top_vec_, pool_top_vec_); - power_layer_->Reshape(pool_top_vec_, power_top_vec_); - product_layer_->Reshape(product_bottom_vec_, top); - break; - } - LFSkernel = clCreateKernel(amdDevice.Program, "LRNFillScalefloat", NULL); - LCDkernel = clCreateKernel(amdDevice.Program, "LRNComputeDifffloat", NULL); - LCOkernel = clCreateKernel(amdDevice.Program, "LRNComputeOutputfloat", NULL); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + top[0]->Reshape(num_, channels_, height_, width_); + scale_.Reshape(num_, channels_, height_, width_); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + split_layer_->Reshape(bottom, split_top_vec_); + square_layer_->Reshape(square_bottom_vec_, square_top_vec_); + pool_layer_->Reshape(square_top_vec_, pool_top_vec_); + power_layer_->Reshape(pool_top_vec_, power_top_vec_); + product_layer_->Reshape(product_bottom_vec_, top); + break; + } } template @@ -254,35 +251,32 @@ void LRNLayer::WithinChannelBackward( template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each 
pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale(LFSkernel, - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput(LCOkernel, - n_threads, bottom_data, scale_data, -beta_, top_data); + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); } template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff(LCDkernel, - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); } template diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 117a966f..24d1e4b8 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -1,4 +1,4 @@ -s#include +#include #include #include "caffe/layer.hpp" diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl index de504dec..2c2c76ee 100644 --- a/src/caffe/ocl/concat_layer.cl +++ b/src/caffe/ocl/concat_layer.cl @@ -26,29 +26,29 @@ template __kernel void Concat(const int nthreads, __global const T* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global T* out_data) { - int index = get_global_id(0); - if(index < nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward == 1) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } } -template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, 
__global const float* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global float* out_data); -template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global double* out_data); +template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl index 620bad72..1a53f772 100644 --- a/src/caffe/ocl/lrn_layer.cl +++ b/src/caffe/ocl/lrn_layer.cl @@ -31,8 +31,8 @@ __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* s for(index; index < nthreads; index += tmp) out[index] = in[index] * pow(scale[index], negative_beta); } -template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); -template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); +template 
__attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); template __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { @@ -78,8 +78,8 @@ __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, co } } } -template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); -template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); +template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); template __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global 
T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { @@ -135,5 +135,5 @@ __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __glob } } -template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); -template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiff_double))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 11352e16..3162b92e 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ 
b/src/caffe/ocl/pooling_layer.cl @@ -279,15 +279,15 @@ __kernel void StoPoolBackward(const int nthreads, } } -template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, - __global float* rand_idx, __global float* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* bottom_diff); -template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, - __global double* rand_idx, __global double* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global double* bottom_diff); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index f7cf9c07..be0c5894 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1338,431 +1338,366 @@ template void TanHBackward(const int count, const 
double* top_diff, template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, - const int optnum) { - std::string kernel_name = "opttrans" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int num_kernels = channels * height * width * optnum; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} - -template void opttrans(const float* data_im, const int im_offset, - const int channels, - const int height, const int width, float* data_opt, const int opt_offset, - const int optnum); -template void opttrans(const double* data_im, const int im_offset, - const int channels, - const int height, const int width, double* data_opt, const int opt_offset, - const int optnum); - -template -void LRNFillScale(cl_kernel LFSkernel, const int nthreads, - const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - cl_int ret; - ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) 
&nthreads); - ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); - ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); - ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); - ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); - ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} -template void LRNFillScale(cl_kernel kernel, const int nthreads, - const float* const in, - const int num, const int channels, const int height, - const int width, const int size, const float alpha_over_size, - const float k, float* const scale); -template void LRNFillScale(cl_kernel kernel, const int nthreads, - const double* const in, - const int num, const int channels, const int height, - const int width, const int size, const double alpha_over_size, - const double k, double* const scale); - -template -void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out) { - cl_int ret; - ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); - ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); - ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); - ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; - 
size_t uiLocal_Work_Size2[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, - uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); -} -template void LRNComputeOutput(cl_kernel kernel, int nthreads, - const float* in, - float* scale, float negative_beta, float* out); -template void LRNComputeOutput(cl_kernel kernel, int nthreads, - const double* in, - double* scale, double negative_beta, double* out); - -template -void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - cl_int ret; - ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); - ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); - ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); - ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); - ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, - 
uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} -template void LRNComputeDiff(cl_kernel kernel, const int nthreads, - const float* const bottom_data, const float* const top_data, - const float* const scale, const float* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const float negative_beta, - const float cache_ratio, float* const bottom_diff); -template void LRNComputeDiff(cl_kernel kernel, const int nthreads, - const double* const bottom_data, const double* const top_data, - const double* const scale, const double* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const double negative_beta, - const double cache_ratio, double* const bottom_diff); - -template -void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { - std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add(const int n, const float* in1, - const float* in2, float* y); -template void caffe_gpu_add(const int n, const double* in1, - const double* in2, double* y); - -template -void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { - std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); - ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) N }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); -template void caffe_gpu_sign_ocl(const int N, const double* X, - double* Y); - -template -void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { - std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) N }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); -template void caffe_gpu_abs_ocl(const int N, const double* X, - double* Y); - -template -void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) { - std::string kernel_name = "div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, 
NULL, NULL)); -} - -template void caffe_gpu_div(const int n, const float* a, const float* b, - float* y); -template void caffe_gpu_div(const int n, const double* a, - const double* b, double* y); - -template -void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { - std::string kernel_name = "add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add_scalar(const int n, const float alpha, - float* top_data); -template void caffe_gpu_add_scalar(const int n, const double alpha, - double* top_data); - -template -void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) { - std::string kernel_name = "element_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_mul(const int n, const float* a, const float* b, - float* y); -template void caffe_gpu_mul(const int n, const double* a, - const double* b, double* y); - -template -void caffe_gpu_powx(const int n, const Dtype* a, 
const Dtype alpha, Dtype* y) { - std::string kernel_name = "powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_powx(const int n, const float* a, - const float alpha, float* y); -template void caffe_gpu_powx(const int n, const double* a, - const double alpha, double* y); - -template -void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype* top_data) - { - std::string kernel_name = "DropoutForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void DropoutForward(const int count, const float* bottom_data, - const int* MaskMem, const float scale_, float* top_data); -template void DropoutForward(const int count, const double* bottom_data, - const int* MaskMem, const double scale_, double* top_data); - -template -void 
DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff) - { - std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); - ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void DropoutBackward(const int count, const float* top_diff, - const int* MaskMem, const float threshold_, const float scale_, - float* bottom_diff); -template void DropoutBackward(const int count, const double* top_diff, - const int* MaskMem, const float threshold_, const double scale_, - double* bottom_diff); - -template -void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) - { - std::string kernel_name = "BNLLForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void BNLLForward(const int 
count, const float* bottom_data, - float *top_data); -template void BNLLForward(const int count, const double* bottom_data, - double *top_data); - -template -void BNLLBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype *bottom_diff) - { - std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void BNLLBackward(const int count, const float* top_diff, - const float* bottom_data, float *bottom_diff); -template void BNLLBackward(const int count, const double* top_diff, - const double* bottom_data, double *bottom_diff); - -template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, - const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype *out_data) - { - std::string kernel_name = "Concat" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*) &forward); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis); - ret |= 
clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) nthreads }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} -template void Concat(const int nthreads, const float* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, float *out_data); -template void Concat(const int nthreads, const double* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, double *out_data); - -template + const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); + ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); + ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); + ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); + ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); + ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; + size_t uiLocal_Work_Size[] = {256}; + OCL_CHECK( 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); +} + +template void opttrans(const float* data_im, const int im_offset, const int channels, + const int height, const int width, float* data_opt, const int opt_offset, const int optnum); +template void opttrans(const double* data_im, const int im_offset, const int channels, + const int height, const int width, double* data_opt, const int opt_offset, const int optnum); + +template +void LRNFillScale(const int nthreads, const Dtype* const in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale){ + std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); + cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in); + ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num); + ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size); + ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size); + ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k); + ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[]={(size_t)nthreads}; + size_t uiLocal_Work_Size[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) ); +} +template void LRNFillScale(const int nthreads, const float* const in, + const int num, const int channels, const int height, + const int width, const int size, const float alpha_over_size, + const float k, float* const scale); +template void 
LRNFillScale(const int nthreads, const double* const in, + const int num, const int channels, const int height, + const int width, const int size, const double alpha_over_size, + const double k, double* const scale); + +template +void LRNComputeOutput(int nthreads, const Dtype* in, + Dtype* scale, Dtype negative_beta, Dtype* out){ + std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); + cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); + ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale); + ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta); + ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[]={(size_t)nthreads}; + size_t uiLocal_Work_Size2[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); +} +template void LRNComputeOutput(int nthreads, const float* in, + float* scale, float negative_beta, float* out); +template void LRNComputeOutput(int nthreads, const double* in, + double* scale, double negative_beta, double* out); + +template +void LRNComputeDiff(const int nthreads, + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff){ + std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); + cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads); + ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data); + ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data); + 
ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale); + ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff); + ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num); + ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels); + ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height); + ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width); + ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size); + ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta); + ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio); + ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[]={(size_t)nthreads}; + size_t uiLocal_Work_Size[]={256}; + OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) ); +} +template void LRNComputeDiff(const int nthreads, + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const float negative_beta, + const float cache_ratio, float* const bottom_diff); +template void LRNComputeDiff(const int nthreads, + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const double negative_beta, + const double cache_ratio, double* const bottom_diff); + +template +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1); + ret |= clSetKernelArg(Kernel, 2, 
sizeof(cl_mem), (void*)&in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add (const int n, const float* in1, const float* in2, float* y); +template void caffe_gpu_add (const int n, const double* in1, const double* in2, double* y); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ){ + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)N}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y ); +template void caffe_gpu_sign_ocl(const int N, const double* X, double* Y ); + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ){ + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)N}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y ); 
+template void caffe_gpu_abs_ocl(const int N, const double* X, double* Y ); + +template +void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ + std::string kernel_name = "div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div (const int n, const float* a, const float* b, float* y); +template void caffe_gpu_div (const int n, const double* a, const double* b, double* y); + +template +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar (const int n, const float alpha, float* top_data); +template void caffe_gpu_add_scalar (const int n, const double alpha, double* top_data); + +template +void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + 
ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul (const int n, const float* a, const float* b, float* y); +template void caffe_gpu_mul (const int n, const double* a, const double* b, double* y); + +template +void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = {(size_t)n}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx (const int n, const float* a, const float alpha, float* y); +template void caffe_gpu_powx (const int n, const double* a, const double alpha, double* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) +{ + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); + ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); + 
ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); + ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_); + ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void DropoutForward(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) +{ + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); + ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); + ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_); + ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void DropoutBackward(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); + + +template +void BNLLForward(const int count, const 
Dtype* bottom_data, Dtype *top_data) +{ + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLForward(const int count, const float* bottom_data, float *top_data); +template void BNLLForward(const int count, const double* bottom_data, double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff) +{ + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); + ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); + ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&bottom_data); + ret |= clSetKernelArg(kernel,3,sizeof(cl_mem), (void*)&bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)count}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLBackward(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff); + + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int 
offset_concat_axis, Dtype *out_data) +{ + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == true)? 1 : 0; + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&k_forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&num_concats); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&concat_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = {(size_t)nthreads}; + size_t Local_Work_Size[] = {256}; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Concat(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data); +template void Concat(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data); + +template void CLLBackward(const int count, const int channels, const Dtype margin, const bool legacy_version, const Dtype alpha, const Dtype* y, const Dtype* diff, const Dtype* dist_sq, From 432dd92135ae79c4625ad9f1fbf0bf3de1379478 Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 10 Sep 2015 00:01:30 +0800 Subject: [PATCH 071/124] Fix the bug that CPU mode cannot run --- src/caffe/common.cpp | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index a6ea3a57..22e9059b 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -91,6 +91,7 @@ void* Caffe::RNG::generator() { Caffe::Caffe() { + amdDevice.Init(); cl_int err = clblasSetup(); if (err != CL_SUCCESS) { LOG(ERROR) << "clBLAS setup failed " << err; From e45e90082b4db3624f2be38a73e36d156b14ddc2 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 10 Sep 2015 01:58:56 +0800 Subject: [PATCH 072/124] update Readme and License file --- LICENSE | 6 ++++++ README.md | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/LICENSE b/LICENSE index d69d16f5..ca91d911 100644 --- a/LICENSE +++ b/LICENSE @@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. + +AMD license on the OpenCL parts + +AMD holds license for the OpenCL related code, kernels and optimizations. +AMD license is added to the file or part of the file that written by AMD. +For details, please see license declaration for individual file. diff --git a/README.md b/README.md index ebec286d..8b47341f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,18 @@ +#OpenCL caffe + +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. + +OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. 
+ +#Design features +-All layers ported to OpenCL +-Passes unit test +-Performance improvement by batched sgemm implementation for conv layer +-User can choose optimal batch number depening on H/W and image size +-Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users +-Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 + + # Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. From b14dac2d2f334074339c83571fbc44744889aef8 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 10 Sep 2015 01:58:56 +0800 Subject: [PATCH 073/124] update Readme and License file --- LICENSE | 6 ++++++ README.md | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/LICENSE b/LICENSE index d69d16f5..ca91d911 100644 --- a/LICENSE +++ b/LICENSE @@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. + +AMD license on the OpenCL parts + +AMD holds license for the OpenCL related code, kernels and optimizations. +AMD license is added to the file or part of the file that written by AMD. +For details, please see license declaration for individual file. diff --git a/README.md b/README.md index ebec286d..a3fd8497 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,23 @@ +#OpenCL caffe + +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. + +OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. 
Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. + +#Design features +-All layers ported to OpenCL + +-Passes unit test + +-Performance improvement by batched sgemm implementation for conv layer + +-User can choose optimal batch number depening on H/W, image size and minibatch size + +-Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users + +-Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 + + # Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. From b5792c317898554cbbc0dab99d8810c8ac55838d Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:25:52 -0700 Subject: [PATCH 074/124] Update README.md --- README.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3fd8497..1bc5d0c6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. 
Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. @@ -17,6 +17,26 @@ OpenCL is an open standard parallel programming language that is supported by mo -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 +Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need. + +#Performance + +We will keep updating the latest performance we could achieve in this section. + +Training speed (Model: AlexNet) +-AMD W9100 (5.2TFLOPS), 255 images per second +-AMD R9 Fury((5.2TFLOPS)), 231 images per second + +Recognition speed (Model: AlexNet) +-AMD W9100 (5.2TFLOPS), 590 images per second +-AMD R9 Fury((5.2TFLOPS)), 699 images per second + +#Wiki +For more information on how to install, use or contribute to this code base, please visit our wiki page: +https://github.com/amd/OpenCL-caffe/wiki + +#License and support +Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. # Caffe From 947aa9a0328205d0ac62906ed9975c886ade5c86 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:27:15 -0700 Subject: [PATCH 075/124] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1bc5d0c6..bf7b2dcd 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,12 @@ We will keep updating the latest performance we could achieve in this section. 
Training speed (Model: AlexNet) -AMD W9100 (5.2TFLOPS), 255 images per second + -AMD R9 Fury((5.2TFLOPS)), 231 images per second Recognition speed (Model: AlexNet) -AMD W9100 (5.2TFLOPS), 590 images per second + -AMD R9 Fury((5.2TFLOPS)), 699 images per second #Wiki From 51872ffba96f2c196b53e9bae33f6a8c5225a8eb Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:27:55 -0700 Subject: [PATCH 076/124] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index bf7b2dcd..27a8f87f 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only We will keep updating the latest performance we could achieve in this section. Training speed (Model: AlexNet) + -AMD W9100 (5.2TFLOPS), 255 images per second -AMD R9 Fury((5.2TFLOPS)), 231 images per second Recognition speed (Model: AlexNet) + -AMD W9100 (5.2TFLOPS), 590 images per second -AMD R9 Fury((5.2TFLOPS)), 699 images per second From af514ad3f8950ae941ba14fc4b926022c2104af8 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:28:29 -0700 Subject: [PATCH 077/124] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27a8f87f..ef4ae50d 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,13 @@ Training speed (Model: AlexNet) -AMD W9100 (5.2TFLOPS), 255 images per second --AMD R9 Fury((5.2TFLOPS)), 231 images per second +-AMD R9 Fury((7.2TFLOPS)), 231 images per second Recognition speed (Model: AlexNet) -AMD W9100 (5.2TFLOPS), 590 images per second --AMD R9 Fury((5.2TFLOPS)), 699 images per second +-AMD R9 Fury((7.2TFLOPS)), 699 images per second #Wiki For more information on how to install, use or contribute to this code base, please visit our wiki page: From 49ecf7c6970252267b9331f2e77de7d5adaf5774 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:41:38 -0700 Subject: [PATCH 078/124] 
Update README.md --- README.md | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8b47341f..54884f21 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,46 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. 
#Design features --All layers ported to OpenCL --Passes unit test --Performance improvement by batched sgemm implementation for conv layer --User can choose optimal batch number depening on H/W and image size --Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users --Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 + -All layers ported to OpenCL + -Passes unit test + + -Performance improvement by batched sgemm implementation for conv layer + + -User can choose optimal batch number depening on H/W, image size and minibatch size + + -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users + + -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 + +Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need. + +#Performance + +We will keep updating the latest performance we could achieve in this section. + +* Training speed (Model: AlexNet) + + -AMD W9100 (5.2TFLOPS), 255 images per second + + -AMD R9 Fury((7.2TFLOPS)), 231 images per second + +* Recognition speed (Model: AlexNet) + + -AMD W9100 (5.2TFLOPS), 590 images per second + + -AMD R9 Fury((7.2TFLOPS)), 699 images per second + +#Wiki +For more information on how to install, use or contribute to this code base, please visit our wiki page: +https://github.com/amd/OpenCL-caffe/wiki + +#License and support +Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. 
We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. # Caffe From dc1f82aee029e1864d596eaed3882830dd7aed0c Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:43:06 -0700 Subject: [PATCH 079/124] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 54884f21..5d2692a9 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ https://github.com/amd/OpenCL-caffe/wiki #License and support Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. -# Caffe +# Oroginal Caffe information +## Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. 
From 15e5dc500a1fbebb5d40acaecbec6c23b251feb5 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:45:01 -0700 Subject: [PATCH 080/124] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5d2692a9..c75d9e1d 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,17 @@ OpenCL is an open standard parallel programming language that is supported by mo #Design features -All layers ported to OpenCL - - -Passes unit test + + -Aligned with CAFFE’s latest code -Performance improvement by batched sgemm implementation for conv layer -User can choose optimal batch number depening on H/W, image size and minibatch size + -Passes unit test + + -OpenCL 2.0, 1.2 + -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 From 4036485a4458f774b5c25dee1be3e02d5577d11d Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 15:45:41 -0700 Subject: [PATCH 081/124] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c75d9e1d..8fadd98f 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ https://github.com/amd/OpenCL-caffe/wiki #License and support Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. -# Oroginal Caffe information +# Original Caffe information ## Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. 
From 20b4a89f4895297a7eabe884230bbde2a7939707 Mon Sep 17 00:00:00 2001 From: Noplz Date: Thu, 10 Sep 2015 11:16:12 +0800 Subject: [PATCH 082/124] Adjust the code style --- include/caffe/blob.hpp | 28 +- include/caffe/common_layers.hpp | 214 ++- include/caffe/data_layers.hpp | 112 +- include/caffe/data_transformer.hpp | 6 +- include/caffe/device.hpp | 9 +- include/caffe/filler.hpp | 72 +- include/caffe/internal_thread.hpp | 3 +- include/caffe/layer.hpp | 75 +- include/caffe/layer_factory.hpp | 14 +- include/caffe/loss_layers.hpp | 124 +- include/caffe/net.hpp | 20 +- include/caffe/neuron_layers.hpp | 191 +- include/caffe/python_layer.hpp | 14 +- include/caffe/solver.hpp | 30 +- include/caffe/syncedmem.hpp | 10 +- include/caffe/util/cudnn.hpp | 26 +- include/caffe/util/db_leveldb.hpp | 11 +- include/caffe/util/db_lmdb.hpp | 13 +- include/caffe/util/im2col.hpp | 106 +- include/caffe/util/insert_splits.hpp | 8 +- include/caffe/util/io.hpp | 44 +- include/caffe/util/math_functions.hpp | 198 +- include/caffe/util/mkl_alternate.hpp | 8 +- include/caffe/util/ocl_util.hpp | 4 +- include/caffe/util/ocl_wrapper.hpp | 318 ++-- include/caffe/util/rng.hpp | 6 +- include/caffe/util/upgrade_proto.hpp | 10 +- include/caffe/vision_layers.hpp | 304 ++-- src/caffe/blob.cpp | 128 +- src/caffe/common.cpp | 14 +- src/caffe/data_transformer.cpp | 92 +- src/caffe/device.cpp | 127 +- src/caffe/internal_thread.cpp | 2 +- src/caffe/layer_factory.cpp | 18 +- src/caffe/layers/absval_layer.cpp | 22 +- src/caffe/layers/accuracy_layer.cpp | 34 +- src/caffe/layers/argmax_layer.cpp | 20 +- src/caffe/layers/base_conv_layer.cpp | 252 +-- src/caffe/layers/base_data_layer.cpp | 52 +- src/caffe/layers/bnll_layer.cpp | 26 +- src/caffe/layers/concat_layer.cpp | 48 +- src/caffe/layers/contrastive_loss_layer.cpp | 104 +- src/caffe/layers/conv_layer.cpp | 66 +- src/caffe/layers/data_layer.cpp | 14 +- src/caffe/layers/deconv_layer.cpp | 34 +- src/caffe/layers/dropout_layer.cpp | 43 +- 
src/caffe/layers/dummy_data_layer.cpp | 46 +- src/caffe/layers/eltwise_layer.cpp | 46 +- src/caffe/layers/euclidean_loss_layer.cpp | 58 +- src/caffe/layers/exp_layer.cpp | 24 +- src/caffe/layers/filter_layer.cpp | 48 +- src/caffe/layers/flatten_layer.cpp | 16 +- src/caffe/layers/hdf5_data_layer.cpp | 47 +- src/caffe/layers/hdf5_output_layer.cpp | 58 +- src/caffe/layers/hinge_loss_layer.cpp | 12 +- src/caffe/layers/im2col_layer.cpp | 68 +- src/caffe/layers/image_data_layer.cpp | 28 +- src/caffe/layers/infogain_loss_layer.cpp | 26 +- src/caffe/layers/inner_product_layer.cpp | 80 +- src/caffe/layers/log_layer.cpp | 28 +- src/caffe/layers/loss_layer.cpp | 10 +- src/caffe/layers/lrn_layer.cpp | 180 +- src/caffe/layers/memory_data_layer.cpp | 32 +- .../multinomial_logistic_loss_layer.cpp | 20 +- src/caffe/layers/mvn_layer.cpp | 138 +- src/caffe/layers/neuron_layer.cpp | 4 +- src/caffe/layers/pooling_layer.cpp | 118 +- src/caffe/layers/power_layer.cpp | 26 +- src/caffe/layers/prelu_layer.cpp | 56 +- src/caffe/layers/reduction_layer.cpp | 42 +- src/caffe/layers/relu_layer.cpp | 24 +- src/caffe/layers/reshape_layer.cpp | 32 +- .../sigmoid_cross_entropy_loss_layer.cpp | 32 +- src/caffe/layers/sigmoid_layer.cpp | 20 +- src/caffe/layers/silence_layer.cpp | 16 +- src/caffe/layers/slice_layer.cpp | 46 +- src/caffe/layers/softmax_layer.cpp | 53 +- src/caffe/layers/softmax_loss_layer.cpp | 52 +- src/caffe/layers/split_layer.cpp | 28 +- src/caffe/layers/spp_layer.cpp | 50 +- src/caffe/layers/tanh_layer.cpp | 18 +- src/caffe/layers/threshold_layer.cpp | 12 +- src/caffe/layers/window_data_layer.cpp | 101 +- src/caffe/net.cpp | 274 +-- src/caffe/ocl/bnll_layer.cl | 6 +- src/caffe/ocl/concat_layer.cl | 48 +- src/caffe/ocl/contrastive_loss_layer.cl | 18 +- src/caffe/ocl/eltwise_layer.cl | 18 +- src/caffe/ocl/im2col.cl | 48 +- src/caffe/ocl/lrn_layer.cl | 2 +- src/caffe/ocl/pooling_layer.cl | 42 +- src/caffe/ocl/prelu_layer.cl | 2 +- src/caffe/ocl/random.cl | 26 +- 
src/caffe/ocl/softmax_layer.cl | 42 +- src/caffe/ocl/softmaxwithloss_layer.cl | 44 +- src/caffe/solver.cpp | 274 +-- src/caffe/syncedmem.cpp | 29 +- src/caffe/util/benchmark.cpp | 11 +- src/caffe/util/db_leveldb.cpp | 2 +- src/caffe/util/im2col.cpp | 209 +-- src/caffe/util/im2col.cu | 76 +- src/caffe/util/insert_splits.cpp | 26 +- src/caffe/util/io.cpp | 52 +- src/caffe/util/math_functions.cpp | 596 +++--- src/caffe/util/math_functions.cu | 148 +- src/caffe/util/ocl_util.cpp | 26 +- src/caffe/util/ocl_wrapper.cpp | 1611 +++++++++-------- src/caffe/util/upgrade_proto.cpp | 224 +-- 108 files changed, 4399 insertions(+), 4224 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index e55ce8e6..26a75558 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -21,21 +21,22 @@ namespace caffe { * * TODO(dox): more thorough description. */ -template +template class Blob { public: Blob() - : data_(), diff_(), count_(0), capacity_(0) { + : + data_(), diff_(), count_(0), capacity_(0) { } /// @brief Deprecated; use Blob(const vector& shape). explicit Blob(const int num, const int channels, const int height, - const int width); + const int width); explicit Blob(const vector& shape); /// @brief Deprecated; use Reshape(const vector& shape). void Reshape(const int num, const int channels, const int height, - const int width); + const int width); /** * @brief Change the dimensions of the blob, allocating new memory if * necessary. 
@@ -125,11 +126,11 @@ class Blob { */ inline int CanonicalAxisIndex(int axis_index) const { CHECK_GE(axis_index, -num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); CHECK_LT(axis_index, num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); if (axis_index < 0) { return axis_index + num_axes(); } @@ -154,7 +155,7 @@ class Blob { } inline int LegacyShape(int index) const { CHECK_LE(num_axes(), 4) - << "Cannot use legacy accessors on Blobs with > 4 axes."; + << "Cannot use legacy accessors on Blobs with > 4 axes."; CHECK_LT(index, 4); CHECK_GE(index, -4); if (index >= num_axes() || index < -num_axes()) { @@ -167,7 +168,7 @@ class Blob { } inline int offset(const int n, const int c = 0, const int h = 0, - const int w = 0) const { + const int w = 0) const { CHECK_GE(n, 0); CHECK_LE(n, num()); CHECK_GE(channels(), 0); @@ -202,15 +203,15 @@ class Blob { * shape if necessary */ void CopyFrom(const Blob& source, bool copy_diff = false, - bool reshape = false); + bool reshape = false); inline Dtype data_at(const int n, const int c, const int h, - const int w) const { + const int w) const { return cpu_data()[offset(n, c, h, w)]; } inline Dtype diff_at(const int n, const int c, const int h, - const int w) const { + const int w) const { return cpu_diff()[offset(n, c, h, w)]; } @@ -282,7 +283,6 @@ class Blob { data_->set_data_layer(); diff_->set_data_layer(); } - ; bool ShapeEquals(const BlobProto& other); diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 879e84e7..d892b5b5 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -25,7 +25,7 @@ namespace caffe 
{ * * NOTE: does not implement Backwards operation. */ -template +template class ArgMaxLayer: public Layer { public: /** @@ -37,12 +37,13 @@ class ArgMaxLayer: public Layer { * if set, output a vector of pairs (max_ind, max_val) for each image. */ explicit ArgMaxLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "ArgMax"; @@ -67,10 +68,11 @@ class ArgMaxLayer: public Layer { * @f$ (for @f$ K = 1 @f$). */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /// @brief Not implemented (non-differentiable function) virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { NOT_IMPLEMENTED; } bool out_max_val_; @@ -81,16 +83,17 @@ class ArgMaxLayer: public Layer { * @brief Takes at least two Blob%s and concatenates them along either the num * or channel dimension, outputting the result. */ -template +template class ConcatLayer: public Layer { public: explicit ConcatLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Concat"; @@ -120,9 +123,9 @@ class ConcatLayer: public Layer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the concatenate inputs. 
@@ -147,9 +150,9 @@ class ConcatLayer: public Layer { * @f$ */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int count_; int num_concats_; @@ -163,16 +166,17 @@ class ConcatLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class EltwiseLayer: public Layer { public: explicit EltwiseLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Eltwise"; @@ -186,13 +190,13 @@ class EltwiseLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); EltwiseParameter_EltwiseOp op_; vector coeffs_; @@ -207,16 +211,17 @@ class EltwiseLayer: public Layer { * the corresponding item has to be filtered, non-zero means that corresponding * item needs to stay). 
*/ -template +template class FilterLayer: public Layer { public: explicit FilterLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Filter"; @@ -249,9 +254,9 @@ class FilterLayer: public Layer { * that haven't been filtered */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the forwarded inputs. @@ -263,9 +268,9 @@ class FilterLayer: public Layer { * gradient is copied */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool first_reshape_; vector indices_to_forward_; @@ -281,14 +286,15 @@ class FilterLayer: public Layer { * and in Backward, the diff pointer of the bottom Blob to that of the top Blob * (see Blob::ShareDiff). */ -template +template class FlattenLayer: public Layer { public: explicit FlattenLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Flatten"; @@ -310,7 +316,7 @@ class FlattenLayer: public Layer { * the outputs -- i.e., the (virtually) copied, flattened inputs */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the concatenate inputs. 
@@ -322,7 +328,7 @@ class FlattenLayer: public Layer { * gradient is (virtually) copied */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -331,16 +337,17 @@ class FlattenLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class InnerProductLayer: public Layer { public: explicit InnerProductLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "InnerProduct"; @@ -354,13 +361,13 @@ class InnerProductLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int M_; int K_; @@ -374,14 +381,15 @@ class InnerProductLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template +template class MVNLayer: public Layer { public: explicit MVNLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "MVN"; @@ -395,13 +403,13 @@ class MVNLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Blob mean_, variance_, temp_; @@ -416,16 +424,17 @@ class MVNLayer: public Layer { * Note: similarly to FlattenLayer, this layer does not change the input values * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff). 
*/ -template +template class ReshapeLayer: public Layer { public: explicit ReshapeLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Reshape"; @@ -439,16 +448,18 @@ class ReshapeLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } /// @brief vector of axes indices whose dimensions we'll copy from the bottom @@ -466,16 +477,17 @@ class ReshapeLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template +template class ReductionLayer: public Layer { public: explicit ReductionLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Reduction"; @@ -489,13 +501,13 @@ class ReductionLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); /// @brief the reduction operation performed by the layer ReductionParameter_ReductionOp op_; @@ -515,14 +527,15 @@ class ReductionLayer: public Layer { * @brief Ignores bottom blobs while producing no top blobs. (This is useful * to suppress outputs during testing.) */ -template +template class SilenceLayer: public Layer { public: explicit SilenceLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual inline const char* type() const { @@ -537,16 +550,16 @@ class SilenceLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } // We can't define Forward_gpu here, since STUB_GPU will provide // its own definition for CPU_ONLY mode. 
virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -554,15 +567,16 @@ class SilenceLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class SoftmaxLayer: public Layer { public: explicit SoftmaxLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } ~SoftmaxLayer(); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Softmax"; @@ -576,13 +590,13 @@ class SoftmaxLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int outer_num_; int inner_num_; @@ -604,16 +618,16 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { explicit CuDNNSoftmaxLayer(const LayerParameter& param) : SoftmaxLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNSoftmaxLayer(); protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const 
vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t handle_; @@ -628,14 +642,15 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class SplitLayer: public Layer { public: explicit SplitLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Split"; @@ -649,13 +664,13 @@ class SplitLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int count_; cl_kernel gpu_add_kernel; @@ -667,16 +682,17 @@ class SplitLayer: public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ -template +template class SliceLayer: public Layer { public: explicit SliceLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Slice"; @@ -690,13 +706,13 @@ class SliceLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int count_; int num_slices_; diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 442e4009..e93c4fe8 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -24,7 +24,7 @@ namespace caffe { * * TODO(dox): thorough documentation for Forward and proto params. */ -template +template class BaseDataLayer: public Layer { public: explicit BaseDataLayer(const LayerParameter& param); @@ -32,20 +32,22 @@ class BaseDataLayer: public Layer { // DataLayerSetUp to do special data layer setup for individual layer types. // This method may not be overridden except by the BasePrefetchingDataLayer. virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } protected: @@ -54,23 +56,24 @@ class BaseDataLayer: public Layer { bool output_labels_; }; -template +template class BasePrefetchingDataLayer: - public BaseDataLayer, public InternalThread { + public BaseDataLayer, public InternalThread { public: explicit BasePrefetchingDataLayer(const LayerParameter& param) - : BaseDataLayer(param) { + : + BaseDataLayer(param) { } // LayerSetUp: implements common data layer setup functionality, and calls // DataLayerSetUp to do special data layer setup for individual layer types. // This method may not be overridden. void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); @@ -84,15 +87,16 @@ class BasePrefetchingDataLayer: Blob transformed_data_; }; -template +template class DataLayer: public BasePrefetchingDataLayer { public: explicit DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) { + : + BasePrefetchingDataLayer(param) { } virtual ~DataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Data"; @@ -119,17 +123,18 @@ class DataLayer: public BasePrefetchingDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template +template class DummyDataLayer: public Layer { public: explicit DummyDataLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual inline const char* type() const { @@ -144,12 +149,14 @@ class DummyDataLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } vector > > fillers_; @@ -161,18 +168,19 @@ class DummyDataLayer: public Layer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template +template class HDF5DataLayer: public Layer { public: explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual ~HDF5DataLayer(); virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual inline const char* type() const { @@ -187,14 +195,16 @@ class HDF5DataLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { } virtual void LoadHDF5FileData(const char* filename); @@ -212,18 +222,19 @@ class HDF5DataLayer: public Layer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template +template class HDF5OutputLayer: public Layer { public: explicit HDF5OutputLayer(const LayerParameter& param) - : Layer(param), file_opened_(false) { + : + Layer(param), file_opened_(false) { } virtual ~HDF5OutputLayer(); virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } virtual inline const char* type() const { @@ -243,13 +254,13 @@ class HDF5OutputLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void SaveBlobs(); bool file_opened_; @@ -264,15 +275,16 @@ class HDF5OutputLayer: public Layer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template +template class ImageDataLayer: public BasePrefetchingDataLayer { public: explicit ImageDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) { + : + BasePrefetchingDataLayer(param) { } virtual ~ImageDataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "ImageData"; @@ -298,14 +310,15 @@ class ImageDataLayer: public BasePrefetchingDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. 
*/ -template +template class MemoryDataLayer: public BaseDataLayer { public: explicit MemoryDataLayer(const LayerParameter& param) - : BaseDataLayer(param), has_new_data_(false) { + : + BaseDataLayer(param), has_new_data_(false) { } virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "MemoryData"; @@ -319,7 +332,7 @@ class MemoryDataLayer: public BaseDataLayer { virtual void AddDatumVector(const vector& datum_vector); virtual void AddMatVector(const vector& mat_vector, - const vector& labels); + const vector& labels); // Reset should accept const pointers, but can't, because the memory // will be given to Blob, which is mutable @@ -341,7 +354,7 @@ class MemoryDataLayer: public BaseDataLayer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); int batch_size_, channels_, height_, width_, size_; Dtype* data_; @@ -359,15 +372,16 @@ class MemoryDataLayer: public BaseDataLayer { * * TODO(dox): thorough documentation for Forward and proto params. */ -template +template class WindowDataLayer: public BasePrefetchingDataLayer { public: explicit WindowDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) { + : + BasePrefetchingDataLayer(param) { } virtual ~WindowDataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "WindowData"; diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 94c32366..c283a244 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -13,7 +13,7 @@ namespace caffe { * @brief Applies common transformations to the input data, such as * scaling, mirroring, substracting the image mean... 
*/ -template +template class DataTransformer { public: explicit DataTransformer(const TransformationParameter& param, Phase phase); @@ -49,7 +49,7 @@ class DataTransformer { * set_cpu_data() is used. See memory_layer.cpp for an example. */ void Transform(const vector & datum_vector, - Blob* transformed_blob); + Blob* transformed_blob); /** * @brief Applies the transformation defined in the data layer's @@ -62,7 +62,7 @@ class DataTransformer { * set_cpu_data() is used. See memory_layer.cpp for an example. */ void Transform(const vector & mat_vector, - Blob* transformed_blob); + Blob* transformed_blob); /** * @brief Applies the transformation defined in the data layer's diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index c6cefedc..2d71b333 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -35,7 +35,8 @@ namespace caffe { class Device { public: Device() - : numPlatforms(0), numDevices(0), device_id(INT_MIN) { + : + numPlatforms(0), numDevices(0), device_id(INT_MIN) { } ~Device(); cl_uint numPlatforms; @@ -69,10 +70,10 @@ class Device { ; void BuildProgram(std::string kernel_dir); - template + template void DisplayDeviceInfo(cl_device_id id, cl_device_info name, - std::string str); - template + std::string str); + template void appendBitfield(T info, T value, std::string name, std::string &str); cl_kernel GetKernel(std::string kernel_name); diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 6c47d7aa..c431dc94 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -16,11 +16,12 @@ namespace caffe { /// @brief Fills a Blob with constant or randomly-generated data. -template +template class Filler { public: explicit Filler(const FillerParameter& param) - : filler_param_(param) { + : + filler_param_(param) { } virtual ~Filler() { } @@ -31,11 +32,12 @@ class Filler { // class Filler /// @brief Fills a Blob with constant values @f$ x = 0 @f$. 
-template +template class ConstantFiller: public Filler { public: explicit ConstantFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { Dtype* data = blob->mutable_cpu_data(); @@ -46,39 +48,41 @@ class ConstantFiller: public Filler { data[i] = value; } CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$. -template +template class UniformFiller: public Filler { public: explicit UniformFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { CHECK(blob->count()); caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$. -template +template class GaussianFiller: public Filler { public: explicit GaussianFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { Dtype* data = blob->mutable_cpu_data(); CHECK(blob->count()); caffe_rng_gaussian(blob->count(), - Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.mean()), + Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); int sparse = this->filler_param_.sparse(); CHECK_GE(sparse, -1); if (sparse >= 0) { @@ -105,11 +109,12 @@ class GaussianFiller: public Filler { /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$ * such that @f$ \forall i \sum_j x_{ij} = 1 @f$. 
*/ -template +template class PositiveUnitballFiller: public Filler { public: explicit PositiveUnitballFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { Dtype* data = blob->mutable_cpu_data(); @@ -129,7 +134,7 @@ class PositiveUnitballFiller: public Filler { } } CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; @@ -149,11 +154,12 @@ class PositiveUnitballFiller: public Filler { * * TODO(dox): make notation in above comment consistent with rest & use LaTeX. */ -template +template class XavierFiller: public Filler { public: explicit XavierFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { CHECK(blob->count()); @@ -161,17 +167,17 @@ class XavierFiller: public Filler { int fan_out = blob->count() / blob->channels(); Dtype n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { + FillerParameter_VarianceNorm_AVERAGE) { n = (fan_in + fan_out) / Dtype(2); } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { + FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; } Dtype scale = sqrt(Dtype(3) / n); caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); + blob->mutable_cpu_data()); CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; @@ -192,11 +198,12 @@ class XavierFiller: public Filler { * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this * is currently not the case for inner product layers. 
*/ -template +template class MSRAFiller: public Filler { public: explicit MSRAFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { CHECK(blob->count()); @@ -204,17 +211,17 @@ class MSRAFiller: public Filler { int fan_out = blob->count() / blob->channels(); Dtype n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { + FillerParameter_VarianceNorm_AVERAGE) { n = (fan_in + fan_out) / Dtype(2); } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { + FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; } Dtype std = sqrt(Dtype(2) / n); caffe_rng_gaussian(blob->count(), Dtype(0), std, - blob->mutable_cpu_data()); + blob->mutable_cpu_data()); CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; @@ -251,11 +258,12 @@ class MSRAFiller: public Filler { out = skimage.transform.rescale(img, factor, mode='constant', cval=0) \endcode */ -template +template class BilinearFiller: public Filler { public: explicit BilinearFiller(const FillerParameter& param) - : Filler(param) { + : + Filler(param) { } virtual void Fill(Blob* blob) { CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; @@ -269,7 +277,7 @@ class BilinearFiller: public Filler { data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); } CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; + << "Sparsity not supported by this Filler."; } }; @@ -279,7 +287,7 @@ class BilinearFiller: public Filler { * Ideally this would be replaced by a factory pattern, but we will leave it * this way for now. 
*/ -template +template Filler* GetFiller(const FillerParameter& param) { const std::string& type = param.type(); if (type == "constant") { diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 2df1806e..677deea4 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -21,7 +21,8 @@ namespace caffe { class InternalThread { public: InternalThread() - : thread_() { + : + thread_() { } virtual ~InternalThread(); diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index b01ea959..5651e814 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -23,7 +23,7 @@ namespace caffe { * gradients with respect to their input Blob%s, given the error gradients with * their output Blob%s. */ -template +template class Layer { public: /** @@ -32,7 +32,8 @@ class Layer { * layer. */ explicit Layer(const LayerParameter& param) - : layer_param_(param) { + : + layer_param_(param) { // Set phase and copy blobs (if there are any). phase_ = param.phase(); if (layer_param_.blobs_size() > 0) { @@ -60,7 +61,7 @@ class Layer { * This method may not be overridden. */ void SetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CheckBlobCounts(bottom, top); LayerSetUp(bottom, top); Reshape(bottom, top); @@ -84,7 +85,7 @@ class Layer { * adjust the top blob sizes. */ virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } /** @@ -100,7 +101,7 @@ class Layer { * accomodate the bottom blobs. */ virtual void Reshape(const vector*>& bottom, - const vector*>& top) = 0; + const vector*>& top) = 0; /** * @brief Given the bottom blobs, compute the top blobs and the loss. @@ -120,7 +121,7 @@ class Layer { * Your layer should implement Forward_cpu and (optionally) Forward_gpu. 
*/ inline Dtype Forward(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Given the top blob error gradients, compute the bottom blob error @@ -144,8 +145,8 @@ class Layer { * Your layer should implement Backward_cpu and (optionally) Backward_gpu. */ inline void Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom); + const vector& propagate_down, + const vector*>& bottom); /** * @brief Returns the vector of learnable parameter blobs. @@ -294,7 +295,7 @@ class Layer { */ inline bool param_propagate_down(const int param_id) { return - (param_propagate_down_.size() > param_id) ? + (param_propagate_down_.size() > param_id) ? param_propagate_down_[param_id] : false; } /** @@ -324,13 +325,13 @@ class Layer { /** @brief Using the CPU device, compute the layer output. */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) = 0; + const vector*>& top) = 0; /** * @brief Using the GPU device, compute the layer output. * Fall back to Forward_cpu() if unavailable. */ virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // LOG(WARNING) << "Using CPU code as backup."; return Forward_cpu(bottom, top); } @@ -340,16 +341,16 @@ class Layer { * for the bottom blobs if propagate_down is true. */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) = 0; + const vector& propagate_down, + const vector*>& bottom) = 0; /** * @brief Using the GPU device, compute the gradients for any parameters and * for the bottom blobs if propagate_down is true. * Fall back to Backward_cpu() if unavailable. 
*/ virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { // LOG(WARNING) << "Using CPU code as backup."; Backward_cpu(top, propagate_down, bottom); } @@ -360,41 +361,41 @@ class Layer { * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. */ virtual void CheckBlobCounts(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (ExactNumBottomBlobs() >= 0) { CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) - << type() << " Layer takes " << ExactNumBottomBlobs() - << " bottom blob(s) as input."; + << type() << " Layer takes " << ExactNumBottomBlobs() + << " bottom blob(s) as input."; } if (MinBottomBlobs() >= 0) { CHECK_LE(MinBottomBlobs(), bottom.size()) - << type() << " Layer takes at least " << MinBottomBlobs() - << " bottom blob(s) as input."; + << type() << " Layer takes at least " << MinBottomBlobs() + << " bottom blob(s) as input."; } if (MaxBottomBlobs() >= 0) { CHECK_GE(MaxBottomBlobs(), bottom.size()) - << type() << " Layer takes at most " << MaxBottomBlobs() - << " bottom blob(s) as input."; + << type() << " Layer takes at most " << MaxBottomBlobs() + << " bottom blob(s) as input."; } if (ExactNumTopBlobs() >= 0) { CHECK_EQ(ExactNumTopBlobs(), top.size()) - << type() << " Layer produces " << ExactNumTopBlobs() - << " top blob(s) as output."; + << type() << " Layer produces " << ExactNumTopBlobs() + << " top blob(s) as output."; } if (MinTopBlobs() >= 0) { CHECK_LE(MinTopBlobs(), top.size()) - << type() << " Layer produces at least " << MinTopBlobs() - << " top blob(s) as output."; + << type() << " Layer produces at least " << MinTopBlobs() + << " top blob(s) as output."; } if (MaxTopBlobs() >= 0) { CHECK_GE(MaxTopBlobs(), top.size()) - << type() << " Layer produces at most " << MaxTopBlobs() - << " top blob(s) as output."; + << type() << " Layer produces at most " << MaxTopBlobs() + << " top blob(s) as output."; } if 
(EqualNumBottomTopBlobs()) { CHECK_EQ(bottom.size(), top.size()) - << type() << " Layer produces one top blob as output for each " - << "bottom blob input."; + << type() << " Layer produces one top blob as output for each " + << "bottom blob input."; } } @@ -406,7 +407,7 @@ class Layer { const int num_loss_weights = layer_param_.loss_weight_size(); if (num_loss_weights) { CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " - "unspecified or specified once per top blob."; + "unspecified or specified once per top blob."; for (int top_id = 0; top_id < top.size(); ++top_id) { const Dtype loss_weight = layer_param_.loss_weight(top_id); if (loss_weight == Dtype(0)) { @@ -427,9 +428,9 @@ class Layer { // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. -template +template inline Dtype Layer::Forward(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype loss = 0; Reshape(bottom, top); switch (Caffe::mode()) { @@ -467,10 +468,10 @@ inline Dtype Layer::Forward(const vector*>& bottom, return loss; } -template +template inline void Layer::Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { switch (Caffe::mode()) { case Caffe::CPU: Backward_cpu(top, propagate_down, bottom); @@ -484,7 +485,7 @@ inline void Layer::Backward(const vector*>& top, } // Serialize LayerParameter to protocol buffer -template +template void Layer::ToProto(LayerParameter* param, bool write_diff) { param->Clear(); param->CopyFrom(layer_param_); diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index e679ae6a..b64b9eb2 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -47,10 +47,10 @@ namespace caffe { -template +template class Layer; -template +template class LayerRegistry { public: typedef shared_ptr > 
(*Creator)(const LayerParameter&); @@ -65,7 +65,7 @@ class LayerRegistry { static void AddCreator(const string& type, Creator creator) { CreatorRegistry& registry = Registry(); CHECK_EQ(registry.count(type), 0) - << "Layer type " << type << " already registered."; + << "Layer type " << type << " already registered."; registry[type] = creator; } @@ -75,7 +75,7 @@ class LayerRegistry { const string& type = param.type(); CreatorRegistry& registry = Registry(); CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type - << " (known types: " << LayerTypeList() << ")"; + << " (known types: " << LayerTypeList() << ")"; return registry[type](param); } @@ -89,7 +89,7 @@ class LayerRegistry { CreatorRegistry& registry = Registry(); string layer_types; for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { + iter != registry.end(); ++iter) { if (iter != registry.begin()) { layer_types += ", "; } @@ -99,11 +99,11 @@ class LayerRegistry { } }; -template +template class LayerRegisterer { public: LayerRegisterer(const string& type, - shared_ptr > (*creator)(const LayerParameter&)) { + shared_ptr > (*creator)(const LayerParameter&)) { // LOG(INFO) << "Registering layer type: " << type; LayerRegistry::AddCreator(type, creator); } diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 9e74ca85..766645b5 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -19,7 +19,7 @@ const float kLOG_THRESHOLD = 1e-20; * @brief Computes the classification accuracy for a one-of-many * classification task. */ -template +template class AccuracyLayer: public Layer { public: /** @@ -31,12 +31,13 @@ class AccuracyLayer: public Layer { * correct if the correct label is among the top 5 predicted labels. 
*/ explicit AccuracyLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Accuracy"; @@ -74,11 +75,12 @@ class AccuracyLayer: public Layer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { for (int i = 0; i < propagate_down.size(); ++i) { if (propagate_down[i]) { NOT_IMPLEMENTED; @@ -104,16 +106,17 @@ class AccuracyLayer: public Layer { * LossLayers are typically only capable of backpropagating to their first input * -- the predictions. */ -template +template class LossLayer: public Layer { public: explicit LossLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp( - const vector*>& bottom, const vector*>& top); + const vector*>& bottom, const vector*>& top); virtual void Reshape( - const vector*>& bottom, const vector*>& top); + const vector*>& bottom, const vector*>& top); virtual inline int ExactNumBottomBlobs() const { return 2; @@ -164,14 +167,15 @@ class LossLayer: public Layer { * d = \left| \left| a_n - b_n \right| \right|_2^2 @f$. * This can be used to train siamese networks. 
*/ -template +template class ContrastiveLossLayer: public LossLayer { public: explicit ContrastiveLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() { + : + LossLayer(param), diff_() { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline int ExactNumBottomBlobs() const { return 3; @@ -190,9 +194,9 @@ class ContrastiveLossLayer: public LossLayer { protected: /// @copydoc ContrastiveLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the Contrastive error gradient w.r.t. the inputs. @@ -220,9 +224,9 @@ class ContrastiveLossLayer: public LossLayer { * propagate_down[1] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Blob diff_; // cached for backward pass Blob dist_sq_; // cached for backward pass @@ -256,14 +260,15 @@ class ContrastiveLossLayer: public LossLayer { * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve * linear least squares problems! We use it only as an instructive example.) 
*/ -template +template class EuclideanLossLayer: public LossLayer { public: explicit EuclideanLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() { + : + LossLayer(param), diff_() { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "EuclideanLoss"; @@ -279,9 +284,9 @@ class EuclideanLossLayer: public LossLayer { protected: /// @copydoc EuclideanLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the Euclidean error gradient w.r.t. the inputs. @@ -317,9 +322,9 @@ class EuclideanLossLayer: public LossLayer { * @f$ if propagate_down[1] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Blob diff_; }; @@ -367,11 +372,12 @@ class EuclideanLossLayer: public LossLayer { * outside the InnerProductLayer and no other losses outside the * HingeLossLayer). */ -template +template class HingeLossLayer: public LossLayer { public: explicit HingeLossLayer(const LayerParameter& param) - : LossLayer(param) { + : + LossLayer(param) { } virtual inline const char* type() const { @@ -381,7 +387,7 @@ class HingeLossLayer: public LossLayer { protected: /// @copydoc HingeLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the hinge loss error gradient w.r.t. the predictions. 
@@ -411,7 +417,7 @@ class HingeLossLayer: public LossLayer { * the labels -- ignored as we can't compute their error gradients */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -446,16 +452,17 @@ class HingeLossLayer: public LossLayer { * \log(\hat{p}_{n,k}) * @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$. */ -template +template class InfogainLossLayer: public LossLayer { public: explicit InfogainLossLayer(const LayerParameter& param) - : LossLayer(param), infogain_() { + : + LossLayer(param), infogain_() { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should // be the infogain matrix. (Otherwise the infogain matrix is loaded from a @@ -477,7 +484,7 @@ class InfogainLossLayer: public LossLayer { protected: /// @copydoc InfogainLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the infogain loss error gradient w.r.t. the predictions. @@ -512,7 +519,7 @@ class InfogainLossLayer: public LossLayer { * gradient computation is not implemented. 
*/ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Blob infogain_; }; @@ -546,14 +553,15 @@ class InfogainLossLayer: public LossLayer { * \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n}) * @f$ */ -template +template class MultinomialLogisticLossLayer: public LossLayer { public: explicit MultinomialLogisticLossLayer(const LayerParameter& param) - : LossLayer(param) { + : + LossLayer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "MultinomialLogisticLoss"; @@ -562,7 +570,7 @@ class MultinomialLogisticLossLayer: public LossLayer { protected: /// @copydoc MultinomialLogisticLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the multinomial logistic loss error gradient w.r.t. the @@ -593,7 +601,7 @@ class MultinomialLogisticLossLayer: public LossLayer { * the labels -- ignored as we can't compute their error gradients */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -625,18 +633,19 @@ class MultinomialLogisticLossLayer: public LossLayer { * \right] * @f$ */ -template +template class SigmoidCrossEntropyLossLayer: public LossLayer { public: explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) - : LossLayer(param), - sigmoid_layer_(new SigmoidLayer(param)), - sigmoid_output_(new Blob()) { + : + LossLayer(param), + sigmoid_layer_(new SigmoidLayer(param)), + sigmoid_output_(new Blob()) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return 
"SigmoidCrossEntropyLoss"; @@ -645,7 +654,7 @@ class SigmoidCrossEntropyLossLayer: public LossLayer { protected: /// @copydoc SigmoidCrossEntropyLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the @@ -678,9 +687,9 @@ class SigmoidCrossEntropyLossLayer: public LossLayer { * the labels -- ignored as we can't compute their error gradients */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); /// The internal SigmoidLayer used to map predictions to probabilities. shared_ptr > sigmoid_layer_; @@ -693,7 +702,7 @@ class SigmoidCrossEntropyLossLayer: public LossLayer { }; // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer. -template class SoftmaxLayer; +template class SoftmaxLayer; /** * @brief Computes the multinomial logistic loss for a one-of-many @@ -724,7 +733,7 @@ template class SoftmaxLayer; * \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n}) * @f$, for softmax output class probabilites @f$ \hat{p} @f$ */ -template +template class SoftmaxWithLossLayer: public LossLayer { public: /** @@ -736,13 +745,14 @@ class SoftmaxWithLossLayer: public LossLayer { * present; otherwise the loss is simply summed over spatial locations. 
*/ explicit SoftmaxWithLossLayer(const LayerParameter& param) - : LossLayer(param) { + : + LossLayer(param) { } ~SoftmaxWithLossLayer(); virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "SoftmaxWithLoss"; @@ -760,9 +770,9 @@ class SoftmaxWithLossLayer: public LossLayer { protected: /// @copydoc SoftmaxWithLossLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the softmax loss error gradient w.r.t. the predictions. * @@ -791,9 +801,9 @@ class SoftmaxWithLossLayer: public LossLayer { * the labels -- ignored as we can't compute their error gradients */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); void ocl_setup(); /// The internal SoftmaxLayer used to map predictions to a distribution. diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 68e631a1..2fe273f5 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -20,7 +20,7 @@ namespace caffe { * * TODO(dox): more thorough description. */ -template +template class Net { public: explicit Net(const NetParameter& param); @@ -51,7 +51,7 @@ class Net { Dtype ForwardTo(int end); /// @brief Run forward using a set of bottom blobs, and return the result. 
const vector*>& Forward(const vector*> & bottom, - Dtype* loss = NULL); + Dtype* loss = NULL); /** * @brief Run forward using a serialized BlobProtoVector and return the * result as a serialized BlobProtoVector @@ -189,7 +189,7 @@ class Net { const shared_ptr > blob_by_name(const string& blob_name) const; bool has_layer(const string& layer_name) const; const shared_ptr > layer_by_name( - const string& layer_name) const; + const string& layer_name) const; void set_debug_info(const bool value) { debug_info_ = value; @@ -201,24 +201,24 @@ class Net { * phase, level, and stage. */ static void FilterNet(const NetParameter& param, - NetParameter* param_filtered); + NetParameter* param_filtered); /// @brief return whether NetState state meets NetStateRule rule static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, - const string& layer_name); + const string& layer_name); protected: // Helpers for Init. /// @brief Append a new input or top blob to the net. void AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx); + const int top_id, set* available_blobs, + map* blob_name_to_idx); /// @brief Append a new bottom blob to the net. int AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx); + const int bottom_id, set* available_blobs, + map* blob_name_to_idx); /// @brief Append a new parameter blob to the net. void AppendParam(const NetParameter& param, const int layer_id, - const int param_id); + const int param_id); /// @brief Helper for displaying debug info in Forward about input Blobs. 
void InputDebugInfo(const int layer_id); diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 5606ff65..89b6c481 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -22,14 +22,15 @@ namespace caffe { * each element of the output depends only on the corresponding input * element. */ -template +template class NeuronLayer: public Layer { public: explicit NeuronLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline int ExactNumBottomBlobs() const { return 1; @@ -49,14 +50,15 @@ class NeuronLayer: public Layer { * -# @f$ (N \times C \times H \times W) @f$ * the computed outputs @f$ y = |x| @f$ */ -template +template class AbsValLayer: public NeuronLayer { public: explicit AbsValLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "AbsVal"; @@ -71,9 +73,9 @@ class AbsValLayer: public NeuronLayer { protected: /// @copydoc AbsValLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the absolute value inputs. @@ -93,9 +95,9 @@ class AbsValLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -115,11 +117,12 @@ class AbsValLayer: public NeuronLayer { * \end{array} \right. 
* @f$ */ -template +template class BNLLLayer: public NeuronLayer { public: explicit BNLLLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual inline const char* type() const { @@ -129,9 +132,9 @@ class BNLLLayer: public NeuronLayer { protected: /// @copydoc BNLLLayer virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the BNLL inputs. @@ -150,9 +153,9 @@ class BNLLLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -166,7 +169,7 @@ class BNLLLayer: public NeuronLayer { * -# @f$ (N \times C \times H \times W) @f$ * the computed outputs @f$ y = |x| @f$ */ -template +template class DropoutLayer: public NeuronLayer { public: /** @@ -176,12 +179,13 @@ class DropoutLayer: public NeuronLayer { * Sets the probability @f$ p @f$ that any given unit is dropped. */ explicit DropoutLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Dropout"; @@ -211,13 +215,13 @@ class DropoutLayer: public NeuronLayer { * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. 
*/ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ Blob rand_vec_; @@ -233,7 +237,7 @@ class DropoutLayer: public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and base @f$ \gamma @f$. */ -template +template class ExpLayer: public NeuronLayer { public: /** @@ -245,10 +249,11 @@ class ExpLayer: public NeuronLayer { * the base @f$ \gamma @f$ */ explicit ExpLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Exp"; @@ -266,9 +271,9 @@ class ExpLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the exp inputs. 
@@ -288,9 +293,9 @@ class ExpLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Dtype inner_scale_, outer_scale_; }; @@ -300,7 +305,7 @@ class ExpLayer: public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and base @f$ \gamma @f$. */ -template +template class LogLayer: public NeuronLayer { public: /** @@ -312,10 +317,11 @@ class LogLayer: public NeuronLayer { * the base @f$ \gamma @f$ */ explicit LogLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Log"; @@ -333,9 +339,9 @@ class LogLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the exp inputs. @@ -355,9 +361,9 @@ class LogLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); Dtype base_scale_; Dtype input_scale_, input_shift_; @@ -369,7 +375,7 @@ class LogLayer: public NeuronLayer { * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and power @f$ \gamma @f$. 
*/ -template +template class PowerLayer: public NeuronLayer { public: /** @@ -380,10 +386,11 @@ class PowerLayer: public NeuronLayer { * - power (\b optional, default 1) the power @f$ \gamma @f$ */ explicit PowerLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Power"; @@ -401,9 +408,9 @@ class PowerLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the power inputs. @@ -426,9 +433,9 @@ class PowerLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); /// @brief @f$ \gamma @f$ from layer_param_.power_param() Dtype power_; @@ -444,7 +451,7 @@ class PowerLayer: public NeuronLayer { * @brief Rectified Linear Unit non-linearity @f$ y = \max(0, x) @f$. * The simple max is fast to compute, and the function does not saturate. */ -template +template class ReLULayer: public NeuronLayer { public: /** @@ -454,7 +461,8 @@ class ReLULayer: public NeuronLayer { * the value @f$ \nu @f$ by which negative values are multiplied. */ explicit ReLULayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual inline const char* type() const { return "ReLU"; @@ -473,9 +481,9 @@ class ReLULayer: public NeuronLayer { * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. 
*/ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the ReLU inputs. @@ -506,9 +514,9 @@ class ReLULayer: public NeuronLayer { * @f$. */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -521,16 +529,16 @@ class CuDNNReLULayer : public ReLULayer { explicit CuDNNReLULayer(const LayerParameter& param) : ReLULayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNReLULayer(); protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t handle_; @@ -547,11 +555,12 @@ class CuDNNReLULayer : public ReLULayer { * Note that the gradient vanishes as the values move away from 0. * The ReLULayer is often a better choice for this reason. 
*/ -template +template class SigmoidLayer: public NeuronLayer { public: explicit SigmoidLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual inline const char* type() const { @@ -570,9 +579,9 @@ class SigmoidLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the sigmoid inputs. @@ -592,9 +601,9 @@ class SigmoidLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -607,16 +616,16 @@ class CuDNNSigmoidLayer : public SigmoidLayer { explicit CuDNNSigmoidLayer(const LayerParameter& param) : SigmoidLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNSigmoidLayer(); protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t handle_; @@ -633,11 +642,12 @@ class CuDNNSigmoidLayer : public SigmoidLayer { * Note that the gradient vanishes as the values move away from 0. * The ReLULayer is often a better choice for this reason. 
*/ -template +template class TanHLayer: public NeuronLayer { public: explicit TanHLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual inline const char* type() const { @@ -656,9 +666,9 @@ class TanHLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the sigmoid inputs. @@ -680,9 +690,9 @@ class TanHLayer: public NeuronLayer { * @f$ if propagate_down[0] */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -695,16 +705,16 @@ class CuDNNTanHLayer : public TanHLayer { explicit CuDNNTanHLayer(const LayerParameter& param) : TanHLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNTanHLayer(); protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t handle_; @@ -717,7 +727,7 @@ class CuDNNTanHLayer : public TanHLayer { * @brief Tests whether the input exceeds a threshold: outputs 1 for inputs * above threshold; 0 otherwise. 
*/ -template +template class ThresholdLayer: public NeuronLayer { public: /** @@ -727,10 +737,11 @@ class ThresholdLayer: public NeuronLayer { * the threshold value @f$ t @f$ to which the input values are compared. */ explicit ThresholdLayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Threshold"; @@ -752,12 +763,13 @@ class ThresholdLayer: public NeuronLayer { * @f$ */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /// @brief Not implemented (non-differentiable function) virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { NOT_IMPLEMENTED; } @@ -772,7 +784,7 @@ class ThresholdLayer: public NeuronLayer { * channels. The number of axes of input blob should be greater than or * equal to 2. The 1st axis (0-based) is seen as channels. */ -template +template class PReLULayer: public NeuronLayer { public: /** @@ -784,14 +796,15 @@ class PReLULayer: public NeuronLayer { * negative slopes are shared across channels. */ explicit PReLULayer(const LayerParameter& param) - : NeuronLayer(param) { + : + NeuronLayer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "PReLU"; @@ -809,9 +822,9 @@ class PReLULayer: public NeuronLayer { * @f$. 
*/ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); /** * @brief Computes the error gradient w.r.t. the PReLU inputs. @@ -842,9 +855,9 @@ class PReLULayer: public NeuronLayer { * @f$. */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool channel_shared_; Blob multiplier_; // dot multiplier for backward computation of params diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 653f5e36..41e2c21a 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -10,15 +10,16 @@ namespace bp = boost::python; namespace caffe { -template +template class PythonLayer: public Layer { public: PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { + : + Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { try { self_.attr("setup")(bottom, top); } catch (bp::error_already_set) { @@ -28,7 +29,7 @@ class PythonLayer: public Layer { } virtual void Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { try { self_.attr("reshape")(bottom, top); } catch (bp::error_already_set) { @@ -43,7 +44,7 @@ class PythonLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { try { self_.attr("forward")(bottom, top); } catch (bp::error_already_set) { @@ -52,7 +53,8 @@ class PythonLayer: public Layer { } } virtual void Backward_cpu(const vector*>& top, - const vector& 
propagate_down, const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { try { self_.attr("backward")(top, propagate_down, bottom); } catch (bp::error_already_set) { diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 688fb99f..60dbc5b0 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -14,7 +14,7 @@ namespace caffe { * Requires implementation of ApplyUpdate to compute a parameter update * given the current state of the Net parameters. */ -template +template class Solver { public: explicit Solver(const SolverParameter& param); @@ -78,15 +78,17 @@ class Solver { * @brief Optimizes the parameters of a Net using * stochastic gradient descent (SGD) with momentum. */ -template +template class SGDSolver: public Solver { public: explicit SGDSolver(const SolverParameter& param) - : Solver(param) { + : + Solver(param) { PreSolve(); } explicit SGDSolver(const string& param_file) - : Solver(param_file) { + : + Solver(param_file) { PreSolve(); } @@ -117,14 +119,16 @@ class SGDSolver: public Solver { DISABLE_COPY_AND_ASSIGN (SGDSolver); }; -template +template class NesterovSolver: public SGDSolver { public: explicit NesterovSolver(const SolverParameter& param) - : SGDSolver(param) { + : + SGDSolver(param) { } explicit NesterovSolver(const string& param_file) - : SGDSolver(param_file) { + : + SGDSolver(param_file) { } protected: @@ -137,15 +141,17 @@ class NesterovSolver: public SGDSolver { DISABLE_COPY_AND_ASSIGN (NesterovSolver); }; -template +template class AdaGradSolver: public SGDSolver { public: explicit AdaGradSolver(const SolverParameter& param) - : SGDSolver(param) { + : + SGDSolver(param) { constructor_sanity_check(); } explicit AdaGradSolver(const string& param_file) - : SGDSolver(param_file) { + : + SGDSolver(param_file) { constructor_sanity_check(); } @@ -153,7 +159,7 @@ class AdaGradSolver: public SGDSolver { virtual void ComputeUpdateValue(int param_id, Dtype rate); void 
constructor_sanity_check() { CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; + << "Momentum cannot be used with AdaGrad."; } void ocl_setup(); @@ -162,7 +168,7 @@ class AdaGradSolver: public SGDSolver { DISABLE_COPY_AND_ASSIGN (AdaGradSolver); }; -template +template Solver* GetSolver(const SolverParameter& param) { SolverParameter_SolverType type = param.solver_type(); diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 0b053a48..1a16c04a 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -66,13 +66,15 @@ inline void CaffeFreeHost(void* ptr) { class SyncedMemory { public: SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { + : + cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), + own_cpu_data_(false), data_layer_(false) { ocl_setup(); } explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { + : + cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), + own_cpu_data_(false), data_layer_(false) { ocl_setup(); } diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index 4acca743..1ff29356 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -67,29 +67,29 @@ namespace caffe { template inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { + int n, int c, int h, int w, + int stride_n, int stride_c, int stride_h, int stride_w) { CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); + n, c, h, w, stride_n, stride_c, stride_h, stride_w)); } template inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { + int n, int c, int h, int w) { const int stride_w = 
1; const int stride_h = w * stride_w; const int stride_c = h * stride_h; const int stride_n = c * stride_c; setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); + stride_n, stride_c, stride_h, stride_w); } template inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { + int n, int c, int h, int w) { CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); + n, c, h, w)); } template @@ -99,16 +99,16 @@ namespace caffe { template inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, - cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, int pad_w, int stride_h, int stride_w) { + cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, + int pad_h, int pad_w, int stride_h, int stride_w) { CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); } template inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { + PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, + int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { switch (poolmethod) { case PoolingParameter_PoolMethod_MAX: *mode = CUDNN_POOLING_MAX; @@ -121,7 +121,7 @@ namespace caffe { } CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); + pad_h, pad_w, stride_h, stride_w)); } } // namespace cudnn diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index d3716de7..c63fdbb0 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -14,7 +14,8 @@ namespace db { class 
LevelDBCursor: public Cursor { public: explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { + : + iter_(iter) { SeekToFirst(); } ~LevelDBCursor() { @@ -43,7 +44,8 @@ class LevelDBCursor: public Cursor { class LevelDBTransaction: public Transaction { public: explicit LevelDBTransaction(leveldb::DB* db) - : db_(db) { + : + db_(db) { CHECK_NOTNULL(db_); } virtual void Put(const string& key, const string& value) { @@ -52,7 +54,7 @@ class LevelDBTransaction: public Transaction { virtual void Commit() { leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); + << std::endl << status.ToString(); } private: @@ -65,7 +67,8 @@ class LevelDBTransaction: public Transaction { class LevelDB: public DB { public: LevelDB() - : db_(NULL) { + : + db_(NULL) { } virtual ~LevelDB() { Close(); diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index 06424c94..68cbb93a 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -17,7 +17,8 @@ inline void MDB_CHECK(int mdb_status) { class LMDBCursor: public Cursor { public: explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { + : + mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { SeekToFirst(); } virtual ~LMDBCursor() { @@ -32,11 +33,11 @@ class LMDBCursor: public Cursor { } virtual string key() { return string(static_cast(mdb_key_.mv_data), - mdb_key_.mv_size); + mdb_key_.mv_size); } virtual string value() { return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); + mdb_value_.mv_size); } virtual bool valid() { return valid_; @@ -62,7 +63,8 @@ class LMDBCursor: public Cursor { class LMDBTransaction: public Transaction { public: explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { + : + mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) 
{ } virtual void Put(const string& key, const string& value); virtual void Commit() { @@ -79,7 +81,8 @@ class LMDBTransaction: public Transaction { class LMDB: public DB { public: LMDB() - : mdb_env_(NULL) { + : + mdb_env_(NULL) { } virtual ~LMDB() { Close(); diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index fda13567..ee7ea10b 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -29,84 +29,84 @@ namespace caffe { -template +template void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_col); -template +template void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im); -template +template void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset); -template +template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int 
kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col, const int col_offset); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); -template +template void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_col); -template +template void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im); -template +template void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); -template +template void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset); -template +template void 
im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, int optnum); -template +template void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im, const int img_offset); + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im, const int img_offset); -template +template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int optnum); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, int optnum); -template +template void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, cl_kernel Kernel); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, cl_kernel Kernel); -template +template void im2col_gpu_ocl(cl_mem data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, cl_kernel Kernel); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, cl_kernel Kernel); } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp index 
4c0d0106..c9a40c54 100644 --- a/include/caffe/util/insert_splits.hpp +++ b/include/caffe/util/insert_splits.hpp @@ -12,14 +12,14 @@ namespace caffe { void InsertSplits(const NetParameter& param, NetParameter* param_split); void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param); + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param); string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx); + const int blob_idx); string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx); + const int blob_idx, const int split_idx); } // namespace caffe diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index faef67e3..7bd1d2db 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -39,7 +39,7 @@ inline void MakeTempDir(string* temp_dirname) { strcpy(temp_dirname_cstr, temp_dirname->c_str()); char* mkdtemp_result = mkdtemp(temp_dirname_cstr); CHECK(mkdtemp_result != NULL) - << "Failed to create a temporary directory at: " << *temp_dirname; + << "Failed to create a temporary directory at: " << *temp_dirname; *temp_dirname = temp_dirname_cstr; delete[] temp_dirname_cstr; } @@ -74,13 +74,13 @@ inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) { } inline void ReadProtoFromBinaryFileOrDie(const string& filename, - Message* proto) { + Message* proto) { ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); } void WriteProtoToBinaryFile(const Message& proto, const char* filename); inline void WriteProtoToBinaryFile( - const Message& proto, const string& filename) { + const Message& proto, const string& filename) { WriteProtoToBinaryFile(proto, filename.c_str()); } @@ -91,32 +91,32 @@ inline bool ReadFileToDatum(const string& filename, Datum* datum) { } bool 
ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum); + const int height, const int width, const bool is_color, + const std::string & encoding, Datum* datum); inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, Datum* datum) { + const int height, const int width, const bool is_color, Datum* datum) { return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); + "", datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, Datum* datum) { + const int height, const int width, Datum* datum) { return ReadImageToDatum(filename, label, height, width, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const bool is_color, Datum* datum) { + const bool is_color, Datum* datum) { return ReadImageToDatum(filename, label, 0, 0, is_color, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - Datum* datum) { + Datum* datum) { return ReadImageToDatum(filename, label, 0, 0, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const std::string & encoding, Datum* datum) { + const std::string & encoding, Datum* datum) { return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); } @@ -124,13 +124,13 @@ bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); + const int height, const int width, const bool is_color); cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width); + const int height, const int width); cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); + const bool is_color); cv::Mat ReadImageToCVMat(const string& filename); 
@@ -139,19 +139,19 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); -template +template void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob); -template +template void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob); -template +template void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); + const hid_t file_id, const string& dataset_name, const Blob& blob); } // namespace caffe diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 0a7fd67f..8a36069a 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -40,80 +40,80 @@ namespace caffe { // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. -template +template void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); // Decaf gpu gemm provides an interface that is almost the same as the cpu // gemm function - following the c convention and calling the fortran-order // gpu code under the hood. 
-template +template void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); -template +template cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, - const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, + Dtype* C, const int offC); /*This is Yuan Gao's sgemm_ex*/ -template +template void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C, const int offset1, const int offset2, const int offset3); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C, const int offset1, const int offset2, const int offset3); -template +template cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, - const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, + Dtype* C, const int offC); -template +template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - 
const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); -template +template void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, - const Dtype * x, size_t offx, const Dtype beta, int incx, - Dtype* y, size_t offy, int incy); + const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, + const Dtype * x, size_t offx, const Dtype beta, int incx, + Dtype* y, size_t offy, int incy); -template +template void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); -template +template void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); + Dtype* Y); -template +template void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); + Dtype* Y); -template +template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); -template +template void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); -template +template void caffe_copy(const int N, const Dtype *X, Dtype *Y); -template +template void caffe_set(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); inline void caffe_memset(const size_t N, const int alpha, void* X) { @@ -130,67 +130,67 @@ inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); -template +template void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); -template +template void caffe_gpu_copy(const int N, const Dtype *X, Dtype 
*Y); -template +template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, - Dtype *X); + Dtype *X); -template +template void caffe_scal(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); -template +template void caffe_sqr(const int N, const Dtype* a, Dtype* y); -template +template void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); //CUDA version, need to be deleted -template +template void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, - const Dtype* b, Dtype* y); + const Dtype* b, Dtype* y); -template +template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); //CUDA version, need to be deleted -template +template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); unsigned int caffe_rng_rand(); -template +template Dtype caffe_nextafter(const Dtype b); -template +template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); // caffe_gpu_rng_uniform with two arguments generates integers in the range @@ -202,52 +202,52 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r); // specification of curandGenerateUniform. 
With a = 0, b = 1, just calls // curandGenerateUniform; with other limits will shift and scale the outputs // appropriately after calling curandGenerateUniform. -template +template void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); -template +template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); -template +template void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, int* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); -template +template void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); -template +template void caffe_exp(const int n, const Dtype* a, Dtype* y); -template +template Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); -template +template void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); -template +template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); -template +template uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); + const Dtype* y); // Returns the sum of the absolute values of the elements of vector x -template +template Dtype caffe_cpu_asum(const int n, const Dtype* x); -template +template void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c -template +template inline char caffe_sign(Dtype val) { return (Dtype(0) < val) - (val < Dtype(0)); } @@ -288,7 +288,7 @@ void caffe_gpu_##name(const int n, const double* x, double* y) { \ // output is 1 for the positives, 0 for zero, and -1 for the negatives DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); -template +template void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); 
// This returns a nonzero value if the input has its sign bit set. @@ -296,56 +296,56 @@ void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); using std::signbit; DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); -template +template void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); -template +template void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); -template +template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); -template +template void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); -template +template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); -template +template void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); -template +template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -template +template void caffe_exp(const int n, const Dtype* a, Dtype* y); -template +template void caffe_abs(const int n, const Dtype* a, Dtype* y); -template +template void caffe_log(const int n, const Dtype* a, Dtype* y); -template +template Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); + const Dtype* y, const int incy); } // namespace caffe #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index e0d4d489..06262fbf 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ 
b/include/caffe/util/mkl_alternate.hpp @@ -81,14 +81,14 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, - const int incX, const float beta, float* Y, - const int incY) { + const int incX, const float beta, float* Y, + const int incY) { cblas_sscal(N, beta, Y, incY); cblas_saxpy(N, alpha, X, incX, Y, incY); } inline void cblas_daxpby(const int N, const double alpha, const double* X, - const int incX, const double beta, double* Y, - const int incY) { + const int incX, const double beta, double* Y, + const int incY) { cblas_dscal(N, beta, Y, incY); cblas_daxpy(N, alpha, X, incX, Y, incY); } diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 1bd7c8d4..9febaa04 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -29,11 +29,11 @@ namespace caffe { -template +template void ocl_memset(Dtype* buffer, const Dtype value, const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, - const int count); + const int count); void eventCallback(cl_event event, cl_int event_status, void * user_data); } // namespace caffe diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index dbd712ea..3a9eaa5c 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -31,7 +31,7 @@ namespace caffe { typedef unsigned int uint32_t; -template inline std::string get_dtype_suffix() +template inline std::string get_dtype_suffix() { dtype x; const char type = typeid(x).name()[0]; @@ -50,289 +50,293 @@ template inline std::string get_dtype_suffix() return suffix; } -template +template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, - const int M_, const int packing_num); + const int M_, const int packing_num); -template +template void 
opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, - const int optnum); + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum); -template +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* bottom_data, Dtype* scale_data); + const Dtype* bottom_data, Dtype* scale_data); -template +template void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); -template +template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* scale, Dtype* data); + const Dtype* scale, Dtype* data); -template +template Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* prob_data, const Dtype* label, cl_mem d_loss); + const Dtype* prob_data, const Dtype* label, cl_mem d_loss); -template +template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); -template +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, - const Dtype* label); + const Dtype* label); -template +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - Dtype* top_data); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data); -template +template void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const 
int pad_w_, Dtype* top_data, int* mask, - Dtype* top_mask); + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask); -template +template void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff); + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); -template +template void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff); + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); -template +template void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, - const Dtype* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const 
int stride_h, const int stride_w, Dtype* const bottom_diff); -template + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff); +template void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); -template +template void SigmoidBackward(const int count, const Dtype* top_diff, - const Dtype* top_data, Dtype* bottom_diff); + const Dtype* top_data, Dtype* bottom_diff); -template +template void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data); -template +template void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, - Dtype* bottom_diff); + Dtype* bottom_diff); -template +template void ThresholdForward(const int count, const Dtype threshold, - const Dtype* bottom_data, Dtype* top_data); + const Dtype* bottom_data, Dtype* top_data); -template +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, Dtype* top_data); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data); -template +template void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data); + const int channels_, const int height_, const int width_, + const int 
pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data); -template +template void StoPoolForwardTrain(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* idx_data, Dtype* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data); -template +template void StoPoolForwardTest(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data); -template +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, Dtype* bottom_diff); + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff); -template +template void 
ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, - const int clnum, const int channels_, const int intheight_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, const int pad_, Dtype* bottom_diff); + const int clnum, const int channels_, const int intheight_, + const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const int pad_, Dtype* bottom_diff); -template +template void PReLUForward(const int count, const int channels, const int dim, - const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, - const int div_factor); + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor); -template +template void PReLUBackward(const int count, const int channels, const int dim, - const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, - const Dtype* slope_data, const int div_factor); + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor); -template +template void PReLUParamBackward(const int count, const Dtype* top_diff, - const int offset_out, const Dtype* bottom_data, const int offset_in, - Dtype* bottom_diff); + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff); -template +template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, - Dtype negative_slope); + Dtype negative_slope); -template +template void ReLUBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); -template +template void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y); -template +template void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype *top_data); + 
const int* MaskMem, const Dtype scale_, Dtype *top_data); -template +template void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff); + const float threshold_, const Dtype scale_, Dtype* bottom_diff); -template +template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, - Dtype threshold); + Dtype threshold); -template +template void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y); -template +template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y); -template +template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out); + const int spatial_dim, const Dtype* data, Dtype* out); -template +template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data); + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data); -template +template void kernel_powx(const int count, const Dtype* data, const Dtype alpha, - Dtype* out); + Dtype* out); -template +template void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_log(const int count, const Dtype* data, Dtype* out); -template +template void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out); -template +template void kernel_add_scalar(const int count, const Dtype data, Dtype* out); -template +template void kernel_exp(const int count, const Dtype* data, Dtype* out); -template +template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum); + const int 
spatial_dim, const Dtype* data, Dtype* channel_sum); -template +template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data); + const int spatial_dim, const Dtype* channel_sum, Dtype* data); -template +template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot); + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot); -template +template void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts); + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts); -template +template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts); + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts); -template +template void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y); -template +template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); template void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale); + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha_over_size, + const Dtype k, Dtype* const scale); template 
void LRNComputeOutput(int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out); + Dtype* scale, Dtype negative_beta, Dtype* out); template void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff); + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff); template -void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y); +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y); template -void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y); +void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); template -void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff); +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff); template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data); +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, Dtype *out_data); template void 
CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff); + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff); -template +template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask); + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask); -template +template void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff); + const int blob_idx, const int* mask, Dtype* bottom_diff); } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ // namespace caffe diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp index b59d9a67..7688e16a 100644 --- a/include/caffe/util/rng.hpp +++ b/include/caffe/util/rng.hpp @@ -18,9 +18,9 @@ inline rng_t* caffe_rng() { } // Fisher–Yates algorithm -template +template inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end, - RandomGenerator* gen) { + RandomGenerator* gen) { typedef typename std::iterator_traits::difference_type difference_type; typedef typename boost::uniform_int dist_type; @@ -35,7 +35,7 @@ inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end, } } -template +template inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) { shuffle(begin, end, caffe_rng()); } diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp index d140e029..2dc3cceb 100644 --- a/include/caffe/util/upgrade_proto.hpp +++ b/include/caffe/util/upgrade_proto.hpp @@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param); // taking its top blob as input. // Error if any of these above layers are not-conv layers. 
void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad); + NetParameter* param_upgraded_pad); // Upgrade a single V0LayerConnection to the V1LayerParameter format. bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param); + V1LayerParameter* layer_param); V1LayerParameter_LayerType UpgradeV0LayerType(const string& type); @@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param); bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param); bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param); + LayerParameter* layer_param); const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type); @@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param); // Read parameters from a file into a NetParameter proto message. void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index eb959190..0c954fa2 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -20,17 +20,18 @@ namespace caffe { * @brief Abstract base class that factors out the BLAS code common to * ConvolutionLayer and DeconvolutionLayer. 
*/ -template +template class BaseConvolutionLayer: public Layer { public: explicit BaseConvolutionLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual ~BaseConvolutionLayer(); virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline int MinBottomBlobs() const { return 1; @@ -47,31 +48,31 @@ class BaseConvolutionLayer: public Layer { // The last argument in forward_cpu_gemm is so that we can skip the im2col if // we just called weight_cpu_gemm with the same input. void forward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); + Dtype* output, bool skip_im2col = false); void forward_cpu_bias(Dtype* output, const Dtype* bias); void backward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output); + Dtype* output); void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* - weights); + weights); void backward_cpu_bias(Dtype* bias, const Dtype* input); //opencl related setup void ocl_setup(); #ifndef CPU_ONLY void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); + Dtype* output, bool skip_im2col = false); void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); + Dtype* output, bool skip_im2col = false); void forward_gpu_bias(Dtype* output, const Dtype* bias); void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); + Dtype* col_output); void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, - Dtype* col_output); + Dtype* col_output); void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); + weights); void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype* - 
weights); + weights); void backward_gpu_bias(Dtype* bias, const Dtype* input); #endif @@ -97,44 +98,44 @@ class BaseConvolutionLayer: public Layer { // wrap im2col/col2im so we don't have to remember the (long) argument lists inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); } inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); } #ifndef CPU_ONLY inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, - 0); + conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, + 0); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, - bottom_offset_); + conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, + bottom_offset_); } protected: inline void conv_im2col_gpu_opt(const Dtype* data) { im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2); + conv_in_width_, + kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2); } inline void conv_col2im_gpu_opt(Dtype* data) { col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); + conv_in_width_, + 
kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); } private: inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, - M_ * opt_num2, opt_num2); + M_ * opt_num2, opt_num2); } inline void conv_transpose_gpu(const Dtype* data) { opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, - opt_num2); + opt_num2); } protected: inline void gpu_memset(Dtype* data, Dtype value, int count) { @@ -182,7 +183,7 @@ class BaseConvolutionLayer: public Layer { * be filtered. col2im restores the output spatial structure by rolling up * the output channel N' columns of the output matrix. */ -template +template class ConvolutionLayer: public BaseConvolutionLayer { public: /** @@ -214,7 +215,8 @@ class ConvolutionLayer: public BaseConvolutionLayer { * kernels + stream parallelism) engines. */ explicit ConvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) { + : + BaseConvolutionLayer(param) { } virtual inline const char* type() const { @@ -223,26 +225,26 @@ class ConvolutionLayer: public BaseConvolutionLayer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual inline bool reverse_dimensions() { return false; } virtual void compute_output_shape(); virtual void Forward_gpu_org(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& 
propagate_down, const vector*>& bottom); virtual void Forward_gpu_opt2(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); }; /** @@ -259,11 +261,12 @@ class ConvolutionLayer: public BaseConvolutionLayer { * padding is removed from the output rather than added to the input, and * stride results in upsampling rather than downsampling). */ -template +template class DeconvolutionLayer: public BaseConvolutionLayer { public: explicit DeconvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) { + : + BaseConvolutionLayer(param) { } virtual inline const char* type() const { @@ -272,13 +275,13 @@ class DeconvolutionLayer: public BaseConvolutionLayer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual inline bool reverse_dimensions() { return true; } @@ -306,16 +309,16 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { explicit CuDNNConvolutionLayer(const LayerParameter& param) : ConvolutionLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNConvolutionLayer(); protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu(const 
vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t* handle_; @@ -337,16 +340,17 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class Im2colLayer: public Layer { public: explicit Im2colLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Im2col"; @@ -360,13 +364,13 @@ class Im2colLayer: public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int kernel_h_, kernel_w_; int stride_h_, stride_w_; @@ -376,8 +380,8 @@ class Im2colLayer: public Layer { }; // Forward declare PoolingLayer and SplitLayer for use in LRNLayer. -template class PoolingLayer; -template class SplitLayer; +template class PoolingLayer; +template class SplitLayer; /** * @brief Normalize the input in a local region across or within feature maps. @@ -385,73 +389,81 @@ template class SplitLayer; * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ template -class LRNLayer : public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "LRN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; - - // Fields used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> 
pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; +class LRNLayer: public Layer { + public: + explicit LRNLayer(const LayerParameter& param) + : + Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "LRN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > 
square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; }; @@ -460,16 +472,17 @@ class LRNLayer : public Layer { * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ -template +template class PoolingLayer: public Layer { public: explicit PoolingLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "Pooling"; @@ -484,18 +497,18 @@ class PoolingLayer: public Layer { // others can only output the pooled inputs. virtual inline int MaxTopBlobs() const { return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 2 : 1; + PoolingParameter_PoolMethod_MAX) ? 
2 : 1; } protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); int kernel_h_, kernel_w_; int stride_h_, stride_w_; @@ -520,9 +533,9 @@ class CuDNNPoolingLayer : public PoolingLayer { explicit CuDNNPoolingLayer(const LayerParameter& param) : PoolingLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual ~CuDNNPoolingLayer(); // Currently, cuDNN does not support the extra top blob. virtual inline int MinTopBlobs() const {return -1;} @@ -530,9 +543,9 @@ class CuDNNPoolingLayer : public PoolingLayer { protected: virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; cudnnHandle_t handle_; @@ -548,16 +561,17 @@ class CuDNNPoolingLayer : public PoolingLayer { * so that the result vector of different sized * images are of the same size. 
*/ -template +template class SPPLayer: public Layer { public: explicit SPPLayer(const LayerParameter& param) - : Layer(param) { + : + Layer(param) { } virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Reshape(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual inline const char* type() const { return "SPP"; @@ -572,18 +586,18 @@ class SPPLayer: public Layer { // others can only output the pooled inputs. virtual inline int MaxTopBlobs() const { return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 2 : 1; + PoolingParameter_PoolMethod_MAX) ? 2 : 1; } protected: virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top); virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); // calculates the kernel and stride dimensions for the pooling layer, // returns a correctly configured LayerParameter for a PoolingLayer virtual LayerParameter GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param); + const int bottom_h, const int bottom_w, const SPPParameter spp_param); int pyramid_height_; int bottom_h_, bottom_w_; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index e7d129bb..5e327c67 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -8,9 +8,9 @@ namespace caffe { -template +template void Blob::Reshape(const int num, const int channels, const int height, - const int width) { + const int width) { vector shape(4); shape[0] = num; shape[1] = channels; @@ -19,7 +19,7 @@ void Blob::Reshape(const int num, const int channels, const int height, Reshape(shape); } -template +template void Blob::Reshape(const vector& shape) { CHECK_LE(shape.size(), kMaxBlobAxes); count_ = 1; @@ -37,7 +37,7 @@ void Blob::Reshape(const vector& shape) { } } -template 
+template void Blob::Reshape(const BlobShape& shape) { CHECK_LE(shape.dim_size(), kMaxBlobAxes); vector shape_vec(shape.dim_size()); @@ -47,93 +47,95 @@ void Blob::Reshape(const BlobShape& shape) { Reshape(shape_vec); } -template +template void Blob::ReshapeLike(const Blob& other) { Reshape(other.shape()); } -template +template Blob::Blob(const int num, const int channels, const int height, - const int width) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { + const int width) +// capacity_ must be initialized before calling Reshape +: + capacity_(0) { Reshape(num, channels, height, width); } -template +template Blob::Blob(const vector& shape) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { +// capacity_ must be initialized before calling Reshape +: + capacity_(0) { Reshape(shape); } -template +template const Dtype* Blob::cpu_data() const { CHECK (data_); return (const Dtype*) data_->cpu_data(); } -template +template void Blob::set_cpu_data(Dtype* data) { CHECK(data); data_->set_cpu_data(data); } -template +template const Dtype* Blob::gpu_data() const { CHECK (data_); return (const Dtype*) data_->gpu_data(); } -template +template const Dtype* Blob::gpu_cache_data() const { CHECK (data_); return (const Dtype*) data_->gpu_cache_data(); } -template +template const Dtype* Blob::cpu_diff() const { CHECK (diff_); return (const Dtype*) diff_->cpu_data(); } -template +template const Dtype* Blob::gpu_diff() const { CHECK (diff_); return (const Dtype*) diff_->gpu_data(); } -template +template Dtype* Blob::mutable_cpu_data() { CHECK (data_); return static_cast(data_->mutable_cpu_data()); } -template +template Dtype* Blob::mutable_gpu_data() { CHECK (data_); return static_cast(data_->mutable_gpu_data()); } -template +template Dtype* Blob::mutable_cpu_diff() { CHECK (diff_); return static_cast(diff_->mutable_cpu_data()); } -template +template Dtype* Blob::mutable_gpu_diff() { CHECK (diff_); return 
static_cast(diff_->mutable_gpu_data()); } -template +template void Blob::ShareData(const Blob& other) { CHECK_EQ(count_, other.count()); data_ = other.data(); } -template +template void Blob::ShareDiff(const Blob& other) { CHECK_EQ(count_, other.count()); diff_ = other.diff(); @@ -142,30 +144,30 @@ void Blob::ShareDiff(const Blob& other) { // The "update" method is used for parameter blobs in a Net, which are stored // as Blob or Blob -- hence we do not define it for // Blob or Blob. -template<> void Blob::Update() { +template <> void Blob::Update() { NOT_IMPLEMENTED; } -template<> void Blob::Update() { +template <> void Blob::Update() { NOT_IMPLEMENTED; } -template +template void Blob::Update() { // We will perform update based on where the data is located. switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: // perform computation on CPU caffe_axpy < Dtype > (count_, Dtype(-1), - static_cast(diff_->cpu_data()), - static_cast(data_->mutable_cpu_data())); + static_cast(diff_->cpu_data()), + static_cast(data_->mutable_cpu_data())); break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: #ifndef CPU_ONLY // perform computation on GPU caffe_gpu_axpy < Dtype > (count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); + static_cast(diff_->gpu_data()), + static_cast(data_->mutable_gpu_data())); #else NO_GPU; #endif @@ -175,17 +177,17 @@ void Blob::Update() { } } -template<> unsigned int Blob::asum_data() const { +template <> unsigned int Blob::asum_data() const { NOT_IMPLEMENTED; return 0; } -template<> int Blob::asum_data() const { +template <> int Blob::asum_data() const { NOT_IMPLEMENTED; return 0; } -template +template Dtype Blob::asum_data() const { if (!data_) { return 0; @@ -212,17 +214,17 @@ Dtype Blob::asum_data() const { return 0; } -template<> unsigned int Blob::asum_diff() const { +template <> unsigned int Blob::asum_diff() const { NOT_IMPLEMENTED; return 0; } -template<> int Blob::asum_diff() const { 
+template <> int Blob::asum_diff() const { NOT_IMPLEMENTED; return 0; } -template +template Dtype Blob::asum_diff() const { if (!diff_) { return 0; @@ -249,17 +251,17 @@ Dtype Blob::asum_diff() const { return 0; } -template<> unsigned int Blob::sumsq_data() const { +template <> unsigned int Blob::sumsq_data() const { NOT_IMPLEMENTED; return 0; } -template<> int Blob::sumsq_data() const { +template <> int Blob::sumsq_data() const { NOT_IMPLEMENTED; return 0; } -template +template Dtype Blob::sumsq_data() const { Dtype sumsq; const Dtype* data; @@ -288,17 +290,17 @@ Dtype Blob::sumsq_data() const { return sumsq; } -template<> unsigned int Blob::sumsq_diff() const { +template <> unsigned int Blob::sumsq_diff() const { NOT_IMPLEMENTED; return 0; } -template<> int Blob::sumsq_diff() const { +template <> int Blob::sumsq_diff() const { NOT_IMPLEMENTED; return 0; } -template +template Dtype Blob::sumsq_diff() const { Dtype sumsq; const Dtype* diff; @@ -327,15 +329,15 @@ Dtype Blob::sumsq_diff() const { return sumsq; } -template<> void Blob::scale_data(unsigned int scale_factor) { +template <> void Blob::scale_data(unsigned int scale_factor) { NOT_IMPLEMENTED; } -template<> void Blob::scale_data(int scale_factor) { +template <> void Blob::scale_data(int scale_factor) { NOT_IMPLEMENTED; } -template +template void Blob::scale_data(Dtype scale_factor) { Dtype* data; if (!data_) { @@ -362,15 +364,15 @@ void Blob::scale_data(Dtype scale_factor) { } } -template<> void Blob::scale_diff(unsigned int scale_factor) { +template <> void Blob::scale_diff(unsigned int scale_factor) { NOT_IMPLEMENTED; } -template<> void Blob::scale_diff(int scale_factor) { +template <> void Blob::scale_diff(int scale_factor) { NOT_IMPLEMENTED; } -template +template void Blob::scale_diff(Dtype scale_factor) { Dtype* diff; if (!diff_) { @@ -397,10 +399,10 @@ void Blob::scale_diff(Dtype scale_factor) { } } -template +template bool Blob::ShapeEquals(const BlobProto& other) { if (other.has_num() || 
other.has_channels() || - other.has_height() || other.has_width()) { + other.has_height() || other.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). // Note: we do not use the normal Blob::num(), Blob::channels(), etc. @@ -408,10 +410,10 @@ bool Blob::ShapeEquals(const BlobProto& other) { // parameter blobs were indexed from the end of the blob shape (e.g., bias // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). return shape_.size() <= 4 && - LegacyShape(-4) == other.num() && - LegacyShape(-3) == other.channels() && - LegacyShape(-2) == other.height() && - LegacyShape(-1) == other.width(); + LegacyShape(-4) == other.num() && + LegacyShape(-3) == other.channels() && + LegacyShape(-2) == other.height() && + LegacyShape(-1) == other.width(); } vector other_shape(other.shape().dim_size()); for (int i = 0; i < other.shape().dim_size(); ++i) { @@ -420,7 +422,7 @@ bool Blob::ShapeEquals(const BlobProto& other) { return shape_ == other_shape; } -template +template void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { if (source.count() != count_ || source.shape() != shape_) { if (reshape) { @@ -433,19 +435,19 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { case Caffe::GPU: if (copy_diff) { caffe_copy(count_, source.gpu_diff(), - static_cast(diff_->mutable_gpu_data())); + static_cast(diff_->mutable_gpu_data())); } else { caffe_copy(count_, source.gpu_data(), - static_cast(data_->mutable_gpu_data())); + static_cast(data_->mutable_gpu_data())); } break; case Caffe::CPU: if (copy_diff) { caffe_copy(count_, source.cpu_diff(), - static_cast(diff_->mutable_cpu_data())); + static_cast(diff_->mutable_cpu_data())); } else { caffe_copy(count_, source.cpu_data(), - static_cast(data_->mutable_cpu_data())); + static_cast(data_->mutable_cpu_data())); } break; default: @@ -453,12 +455,12 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { } } 
-template +template void Blob::FromProto(const BlobProto& proto, bool reshape) { if (reshape) { vector shape; if (proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { + proto.has_height() || proto.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). shape.resize(4); @@ -489,7 +491,7 @@ void Blob::FromProto(const BlobProto& proto, bool reshape) { } } -template +template void Blob::ToProto(BlobProto* proto, bool write_diff) const { proto->clear_shape(); for (int i = 0; i < shape_.size(); ++i) { diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 22e9059b..2157c96a 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -91,7 +91,7 @@ void* Caffe::RNG::generator() { Caffe::Caffe() { - amdDevice.Init(); + amdDevice.Init(); cl_int err = clblasSetup(); if (err != CL_SUCCESS) { LOG(ERROR) << "clBLAS setup failed " << err; @@ -121,10 +121,12 @@ void Caffe::DeviceQuery() { class Caffe::RNG::Generator { public: Generator() - : rng_(new caffe::rng_t(cluster_seedgen())) { + : + rng_(new caffe::rng_t(cluster_seedgen())) { } explicit Generator(unsigned int seed) - : rng_(new caffe::rng_t(seed)) { + : + rng_(new caffe::rng_t(seed)) { } caffe::rng_t* rng() { return rng_.get(); @@ -134,11 +136,13 @@ class Caffe::RNG::Generator { }; Caffe::RNG::RNG() - : generator_(new Generator()) { +: + generator_(new Generator()) { } Caffe::RNG::RNG(unsigned int seed) - : generator_(new Generator(seed)) { +: + generator_(new Generator(seed)) { } Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 892d758d..a041e126 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -10,14 +10,15 @@ #include "caffe/util/benchmark.hpp" namespace caffe { -template +template DataTransformer::DataTransformer(const TransformationParameter& param, - Phase phase) - : param_(param), phase_(phase) { + Phase 
phase) +: + param_(param), phase_(phase) { // check if we want to use mean_file if (param_.has_mean_file()) { CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; + "Cannot specify mean_file and mean_value at the same time"; const string& mean_file = param.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; @@ -27,16 +28,16 @@ DataTransformer::DataTransformer(const TransformationParameter& param, // check if we want to use mean_value if (param_.mean_value_size() > 0) { CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; + "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < param_.mean_value_size(); ++c) { mean_values_.push_back(param_.mean_value(c)); } } } -template +template void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { + Dtype* transformed_data) { const string& data = datum.data(); const int datum_channels = datum.channels(); const int datum_height = datum.height(); @@ -62,7 +63,8 @@ void DataTransformer::Transform(const Datum& datum, } if (has_mean_values) { CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " << datum_channels; + "Specify either 1 mean_value or as many as channels: " + << datum_channels; if (datum_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < datum_channels; ++c) { @@ -102,17 +104,17 @@ void DataTransformer::Transform(const Datum& datum, } if (has_uint8) { datum_element = - static_cast(static_cast(data[data_index])); + static_cast(static_cast(data[data_index])); } else { datum_element = datum.float_data(data_index); } if (has_mean_file) { transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; + (datum_element - mean[data_index]) * scale; } else { if (has_mean_values) { 
transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; + (datum_element - mean_values_[c]) * scale; } else { transformed_data[top_index] = datum_element * scale; } @@ -122,14 +124,14 @@ void DataTransformer::Transform(const Datum& datum, } } -template +template void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { + Blob* transformed_blob) { // If datum is encoded, decoded and transform the cv::image. if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; + << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { // If force_color then decode in color otherwise decode in gray. @@ -173,9 +175,9 @@ void DataTransformer::Transform(const Datum& datum, Transform(datum, transformed_data); } -template +template void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int datum_num = datum_vector.size(); const int num = transformed_blob->num(); const int channels = transformed_blob->channels(); @@ -183,8 +185,9 @@ void DataTransformer::Transform(const vector & datum_vector, const int width = transformed_blob->width(); CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) << - "The size of datum_vector must be no greater than transformed_blob->num()"; + CHECK_LE(datum_num, num) + << + "The size of datum_vector must be no greater than transformed_blob->num()"; Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < datum_num; ++item_id) { int offset = transformed_blob->offset(item_id); @@ -193,9 +196,9 @@ void DataTransformer::Transform(const vector & datum_vector, } } -template +template void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int mat_num = mat_vector.size(); const int num = transformed_blob->num(); 
const int channels = transformed_blob->channels(); @@ -204,7 +207,7 @@ void DataTransformer::Transform(const vector & mat_vector, CHECK_GT(mat_num, 0) << "There is no MAT to add"; CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; + "The size of mat_vector must be equals to transformed_blob->num()"; Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < mat_num; ++item_id) { int offset = transformed_blob->offset(item_id); @@ -213,9 +216,9 @@ void DataTransformer::Transform(const vector & mat_vector, } } -template +template void DataTransformer::Transform(const cv::Mat& cv_img, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); const int img_height = cv_img.rows; @@ -252,7 +255,7 @@ void DataTransformer::Transform(const cv::Mat& cv_img, } if (has_mean_values) { CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; + "Specify either 1 mean_value or as many as channels: " << img_channels; if (img_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < img_channels; ++c) { @@ -301,11 +304,11 @@ void DataTransformer::Transform(const cv::Mat& cv_img, if (has_mean_file) { int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; + (pixel - mean[mean_index]) * scale; } else { if (has_mean_values) { transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; + (pixel - mean_values_[c]) * scale; } else { transformed_data[top_index] = pixel * scale; } @@ -315,9 +318,9 @@ void DataTransformer::Transform(const cv::Mat& cv_img, } } -template +template void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = 
param_.crop_size(); const int input_num = input_blob->num(); const int input_channels = input_blob->channels(); @@ -328,10 +331,10 @@ void DataTransformer::Transform(Blob* input_blob, // Initialize transformed_blob with the right shape. if (crop_size) { transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); + crop_size, crop_size); } else { transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); + input_height, input_width); } } @@ -377,13 +380,14 @@ void DataTransformer::Transform(Blob* input_blob, for (int n = 0; n < input_num; ++n) { int offset = input_blob->offset(n); caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); + data_mean_.cpu_data(), input_data + offset); } } if (has_mean_values) { CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " << input_channels; + "Specify either 1 mean_value or as many as channels: " + << input_channels; if (mean_values_.size() == 1) { caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); } else { @@ -391,7 +395,7 @@ void DataTransformer::Transform(Blob* input_blob, for (int c = 0; c < input_channels; ++c) { int offset = input_blob->offset(n, c); caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); + input_data + offset); } } } @@ -427,11 +431,11 @@ void DataTransformer::Transform(Blob* input_blob, } } -template +template vector DataTransformer::InferBlobShape(const Datum& datum) { if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; + << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { // If force_color then decode in color otherwise decode in gray. 
@@ -460,9 +464,9 @@ vector DataTransformer::InferBlobShape(const Datum& datum) { return shape; } -template +template vector DataTransformer::InferBlobShape( - const vector & datum_vector) { + const vector & datum_vector) { const int num = datum_vector.size(); CHECK_GT(num, 0) << "There is no datum to in the vector"; // Use first datum in the vector to InferBlobShape. @@ -472,7 +476,7 @@ vector DataTransformer::InferBlobShape( return shape; } -template +template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); @@ -491,9 +495,9 @@ vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { return shape; } -template +template vector DataTransformer::InferBlobShape( - const vector & mat_vector) { + const vector & mat_vector) { const int num = mat_vector.size(); CHECK_GT(num, 0) << "There is no cv_img to in the vector"; // Use first cv_img in the vector to InferBlobShape. @@ -503,10 +507,10 @@ vector DataTransformer::InferBlobShape( return shape; } -template +template void DataTransformer::InitRand() { const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); + (phase_ == TRAIN && param_.crop_size()); if (needs_rand) { const unsigned int rng_seed = caffe_rng_rand(); rng_.reset(new Caffe::RNG(rng_seed)); @@ -515,12 +519,12 @@ void DataTransformer::InitRand() { } } -template +template int DataTransformer::Rand(int n) { CHECK (rng_); CHECK_GT(n, 0); caffe::rng_t* rng = - static_cast(rng_->generator()); + static_cast(rng_->generator()); return ((*rng)() % n); } diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 689f706e..9e53a66a 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -58,7 +58,7 @@ cl_int Device::Init(int deviceId) { size_t nameLen; cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, - platformName, &nameLen); + platformName, &nameLen); if (res != CL_SUCCESS) { fprintf(stderr, "Err: Failed 
to Get Platform Info\n"); return 0; @@ -75,13 +75,14 @@ cl_int Device::Init(int deviceId) { } else { pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id)); OCL_CHECK( - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, - &uiNumDevices)); + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, + pDevices, + &uiNumDevices)); if (deviceId == -1) { int i; for (i = 0; i < (int) uiNumDevices; i++) { clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, - sizeof(cl_bool), &unified_memory, NULL); + sizeof(cl_bool), &unified_memory, NULL); if (!unified_memory) { //skip iGPU //we pick the first dGPU we found pDevices[0] = pDevices[i]; @@ -108,9 +109,9 @@ cl_int Device::Init(int deviceId) { return 0; } CommandQueue = clCreateCommandQueue(Context, pDevices[0], - CL_QUEUE_PROFILING_ENABLE, NULL); + CL_QUEUE_PROFILING_ENABLE, NULL); CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], - CL_QUEUE_PROFILING_ENABLE, NULL); + CL_QUEUE_PROFILING_ENABLE, NULL); if (NULL == CommandQueue || NULL == CommandQueue_helper) { fprintf(stderr, "Err: Failed to Create Commandqueue\n"); return 0; @@ -122,12 +123,12 @@ cl_int Device::Init(int deviceId) { } void Device::BuildProgram(std::string kernel_dir) - { + { std::string strSource = ""; DIR *ocl_dir; struct dirent *dirp; if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) - { + { fprintf(stderr, "Err: Open ocl dir failed!\n"); } while ((dirp = readdir(ocl_dir)) != NULL) @@ -152,18 +153,18 @@ void Device::BuildProgram(std::string kernel_dir) uiArrSourceSize[0] = strlen(pSource); Program = NULL; Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, - NULL); + NULL); if (NULL == Program) { fprintf(stderr, "Err: Failed to create program\n"); } cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), - NULL, NULL); + NULL, NULL); LOG(INFO) << "Build Program"; if (CL_SUCCESS != iStatus) { fprintf(stderr, "Err: Failed to build program\n"); 
char szBuildLog[16384]; clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, - sizeof(szBuildLog), szBuildLog, NULL); + sizeof(szBuildLog), szBuildLog, NULL); std::cout << szBuildLog; clReleaseProgram (Program); } @@ -198,10 +199,10 @@ cl_int Device::ConvertToString(std::string pFileName, std::string &Str) { } cl_kernel Device::GetKernel(std::string kernel_name) - { + { std::map::iterator it = Kernels.find(kernel_name); if (it == Kernels.end()) - { + { cl_int _err = 0; cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); OCL_CHECK(_err); @@ -214,7 +215,7 @@ void Device::ReleaseKernels() { std::map::iterator it; for (it = Kernels.begin(); it != Kernels.end(); it++) - { + { clReleaseKernel(it->second); } } @@ -224,16 +225,16 @@ void Device::DisplayPlatformInfo() { err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) - { + { LOG(ERROR) << "Failed to find any OpenCL platform."; return; } platformIDs = (cl_platform_id *) malloc( - sizeof(cl_platform_id) * numPlatforms); + sizeof(cl_platform_id) * numPlatforms); err = clGetPlatformIDs(numPlatforms, platformIDs, NULL); if (err != CL_SUCCESS) - { + { LOG(ERROR) << "Failed to find any OpenCL platform."; return; } @@ -247,19 +248,19 @@ void Device::DisplayPlatformInfo() { DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, - "CL_PLATFORM_EXTENSIONS"); + "CL_PLATFORM_EXTENSIONS"); } } void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, - std::string str) { + std::string str) { cl_int err; std::size_t paramValueSize; err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); if (err != CL_SUCCESS) - { + { LOG(ERROR) << "Failed to find OpenCL platform:" << str; return; } @@ -267,7 +268,7 @@ void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, char * info = (char *) alloca(sizeof(char) * 
paramValueSize); err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); if (err != CL_SUCCESS) - { + { LOG(ERROR) << "Failed to find OpenCL platform:" << str; return; } @@ -280,10 +281,10 @@ void Device::GetDeviceInfo() { //by default, we select the first platform. can be extended for more platforms //query GPU device for now err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, - &numDevices); + &numDevices); // we allow program run if no GPU is found. Just return. No error reported. if (numDevices < 1) - { + { LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; return; @@ -291,9 +292,9 @@ void Device::GetDeviceInfo() { DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, - DeviceIDs, NULL); + DeviceIDs, NULL); if (err != CL_SUCCESS) - { + { LOG(INFO) << "Failed to find any GPU devices."; return; } @@ -302,35 +303,35 @@ void Device::GetDeviceInfo() { for (cl_uint i = 0; i < numDevices; i++) { LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; DisplayDeviceInfo < cl_device_type - > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, 
"ECC support"); DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); DisplayDeviceInfo < size_t - > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, - "Max work item sizes"); + "Max work item sizes"); DisplayDeviceInfo < cl_command_queue_properties - > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); DisplayDeviceInfo < cl_device_exec_capabilities - > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); } } @@ -345,7 +346,7 @@ void Device::DeviceQuery() size_t nameLen; cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, - platformName, &nameLen); + platformName, &nameLen); if (res != CL_SUCCESS) { fprintf(stderr, "Err: Failed to Get 
Platform Info\n"); return; @@ -355,15 +356,15 @@ void Device::DeviceQuery() GetDeviceInfo(); } -template +template void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, - std::string str) { + std::string str) { cl_int err; std::size_t paramValueSize; err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); if (err != CL_SUCCESS) - { + { LOG(ERROR) << "Failed to find OpenCL device info:" << str; return; } @@ -372,7 +373,7 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, T * info = (T *) alloca(sizeof(T) * paramValueSize); err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); if (err != CL_SUCCESS) - { + { LOG(ERROR) << "Failed to find OpenCL device info:" << str; return; } @@ -382,20 +383,20 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, { std::string deviceType; appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); + > ( + *(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); LOG(INFO) << "\t " << str << ":\t" << deviceType; } @@ -404,12 +405,12 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, { std::string memType; appendBitfield < cl_device_exec_capabilities - > ( - *(reinterpret_cast(info)), CL_EXEC_KERNEL, 
"CL_EXEC_KERNEL", memType); + > ( + *(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); appendBitfield < cl_device_exec_capabilities - > ( - *(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); + > ( + *(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); LOG(INFO) << "\t " << str << ":\t" << memType; @@ -419,10 +420,10 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, { std::string memType; appendBitfield < cl_device_exec_capabilities - > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); + > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); appendBitfield < cl_device_exec_capabilities - > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); + > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); LOG(INFO) << "\t " << str << ":\t" << memType; } @@ -434,13 +435,13 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, } -template +template void Device::appendBitfield(T info, T value, std::string name, std::string &str) - { - if (info & value) { - if (str.length() > 0) + if (info & value) { + if (str.length() > 0) + { str.append(" | "); } str.append(name); diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 64f4fa6b..ba302ba8 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -17,7 +17,7 @@ bool InternalThread::StartInternalThread() { } try { thread_.reset( - new boost::thread(&InternalThread::InternalThreadEntry, this)); + new boost::thread(&InternalThread::InternalThreadEntry, this)); } catch (...) 
{ return false; } diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 4ff6e3d4..a720ee92 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -17,9 +17,9 @@ namespace caffe { // Get convolution layer according to engine. -template +template shared_ptr > GetConvolutionLayer( - const LayerParameter& param) { + const LayerParameter& param) { ConvolutionParameter_Engine engine = param.convolution_param().engine(); if (engine == ConvolutionParameter_Engine_DEFAULT) { engine = ConvolutionParameter_Engine_CAFFE; @@ -41,7 +41,7 @@ shared_ptr > GetConvolutionLayer( REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); // Get pooling layer according to engine. -template +template shared_ptr > GetPoolingLayer(const LayerParameter& param) { PoolingParameter_Engine engine = param.pooling_param().engine(); if (engine == PoolingParameter_Engine_DEFAULT) { @@ -56,7 +56,7 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { } else if (engine == PoolingParameter_Engine_CUDNN) { PoolingParameter p_param = param.pooling_param(); if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || - param.top_size() > 1) { + param.top_size() > 1) { LOG(INFO) << "CUDNN does not support padding or multiple tops. " << "Using Caffe's own pooling layer."; return shared_ptr >(new PoolingLayer(param)); @@ -71,7 +71,7 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); // Get relu layer according to engine. -template +template shared_ptr > GetReLULayer(const LayerParameter& param) { ReLUParameter_Engine engine = param.relu_param().engine(); if (engine == ReLUParameter_Engine_DEFAULT) { @@ -94,7 +94,7 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(ReLU, GetReLULayer); // Get sigmoid layer according to engine. 
-template +template shared_ptr > GetSigmoidLayer(const LayerParameter& param) { SigmoidParameter_Engine engine = param.sigmoid_param().engine(); if (engine == SigmoidParameter_Engine_DEFAULT) { @@ -117,7 +117,7 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); // Get softmax layer according to engine. -template +template shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { SoftmaxParameter_Engine engine = param.softmax_param().engine(); if (engine == SoftmaxParameter_Engine_DEFAULT) { @@ -140,7 +140,7 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer); // Get tanh layer according to engine. -template +template shared_ptr > GetTanHLayer(const LayerParameter& param) { TanHParameter_Engine engine = param.tanh_param().engine(); if (engine == TanHParameter_Engine_DEFAULT) { @@ -182,4 +182,4 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer); // Layers that use their constructor as their default creator should be // registered in their corresponding cpp files. Do not register them here. 
} - // namespace caffe + // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index cd99296e..85faa8d3 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -6,25 +6,25 @@ namespace caffe { -template +template void AbsValLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + "allow in-place computation."; } -template +template void AbsValLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { const int count = top[0]->count(); Dtype* top_data = top[0]->mutable_cpu_data(); caffe_abs(count, bottom[0]->cpu_data(), top_data); } -template +template void AbsValLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const int count = top[0]->count(); const Dtype* top_diff = top[0]->cpu_diff(); if (propagate_down[0]) { @@ -35,17 +35,17 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } -template +template void AbsValLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int count = top[0]->count(); Dtype* top_data = top[0]->mutable_gpu_data(); caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); } -template +template void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const int count = top[0]->count(); const Dtype* top_diff = top[0]->gpu_diff(); if (propagate_down[0]) { diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 82f92e27..a26839d4 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -10,39 
+10,39 @@ namespace caffe { -template +template void AccuracyLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { top_k_ = this->layer_param_.accuracy_param().top_k(); has_ignore_label_ = - this->layer_param_.accuracy_param().has_ignore_label(); + this->layer_param_.accuracy_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); } } -template +template void AccuracyLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) - << "top_k must be less than or equal to the number of classes."; + << "top_k must be less than or equal to the number of classes."; label_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); + bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); outer_num_ = bottom[0]->count(0, label_axis_); inner_num_ = bottom[0]->count(label_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " - << "label count (number of labels) must be N*H*W, " - << "with integer values in {0, 1, ..., C-1}."; + << "Number of labels must match number of predictions; " + << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; vector top_shape(0); // Accuracy is a scalar; 0 axes. 
top[0]->Reshape(top_shape); } -template +template void AccuracyLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype accuracy = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); @@ -54,7 +54,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < outer_num_; ++i) { for (int j = 0; j < inner_num_; ++j) { const int label_value = - static_cast(bottom_label[i * inner_num_ + j]); + static_cast(bottom_label[i * inner_num_ + j]); if (has_ignore_label_ && label_value == ignore_label_) { continue; } @@ -64,11 +64,11 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, std::vector < std::pair > bottom_data_vector; for (int k = 0; k < num_labels; ++k) { bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); + bottom_data[i * dim + k * inner_num_ + j], k)); } std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, + bottom_data_vector.end(), std::greater >()); // check if true label is in top k predictions for (int k = 0; k < top_k_; k++) { if (bottom_data_vector[k].second == label_value) { diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index 87cc706e..235e8371 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -8,19 +8,19 @@ namespace caffe { -template +template void ArgMaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { out_max_val_ = this->layer_param_.argmax_param().out_max_val(); top_k_ = this->layer_param_.argmax_param().top_k(); CHECK_GE(top_k_, 1) << " top k must not be less than 1."; CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) - << "top_k must be less than or equal to the number of classes."; + << "top_k must be less than or equal 
to the number of classes."; } -template +template void ArgMaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (out_max_val_) { // Produces max_ind and max_val top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); @@ -30,9 +30,9 @@ void ArgMaxLayer::Reshape(const vector*>& bottom, } } -template +template void ArgMaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); int num = bottom[0]->num(); @@ -41,11 +41,11 @@ void ArgMaxLayer::Forward_cpu(const vector*>& bottom, std::vector < std::pair > bottom_data_vector; for (int j = 0; j < dim; ++j) { bottom_data_vector.push_back( - std::make_pair(bottom_data[i * dim + j], j)); + std::make_pair(bottom_data[i * dim + j], j)); } std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, + bottom_data_vector.end(), std::greater >()); for (int j = 0; j < top_k_; ++j) { top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; } diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 97c9afd3..cefa8a66 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -16,26 +16,27 @@ template cl_mem BaseConvolutionLayer::subTopMem = clCrea template cl_mem BaseConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); #endif -template -void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) - { +template +void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) { if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); 
ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, - CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, NULL, - NULL); + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, + NULL, + NULL); } if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context, - CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, NULL, - NULL); + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, + NULL, + NULL); } } -template +template void BaseConvolutionLayer::ocl_setup() { M_ = num_output_ / group_; K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; @@ -47,31 +48,31 @@ void BaseConvolutionLayer::ocl_setup() { #endif } -template +template BaseConvolutionLayer::~BaseConvolutionLayer() { } -template +template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; + << "corresponding to (num, channels, height, width)"; // Configure the kernel size, padding, stride, and inputs. 
ConvolutionParameter conv_param = this->layer_param_.convolution_param(); CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); } else { @@ -95,7 +96,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, // Special case: im2col is the identity for 1x1 convolution with stride 1 // and no padding, so flag for skipping the buffer and transformation. is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 - && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; + && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; // Configure output channels and groups. 
channels_ = bottom[0]->channels(); num_output_ = this->layer_param_.convolution_param().num_output(); @@ -103,7 +104,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, group_ = this->layer_param_.convolution_param().group(); CHECK_EQ(channels_ % group_, 0); CHECK_EQ(num_output_ % group_, 0) - << "Number of output should be multiples of group."; + << "Number of output should be multiples of group."; if (reverse_dimensions()) { conv_out_channels_ = channels_; conv_in_channels_ = num_output_; @@ -127,16 +128,16 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width this->blobs_[0].reset(new Blob( - conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); + conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( - this->layer_param_.convolution_param().weight_filler())); + this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, initialize and fill the biases. 
if (bias_term_) { vector bias_shape(1, num_output_); this->blobs_[1].reset(new Blob(bias_shape)); shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( - this->layer_param_.convolution_param().bias_filler())); + this->layer_param_.convolution_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } @@ -144,25 +145,25 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, this->param_propagate_down_.resize(this->blobs_.size(), true); } -template +template void BaseConvolutionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; + << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" - " convolution kernel."; + " convolution kernel."; // TODO: generalize to handle inputs of different shapes. for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; CHECK_EQ(channels_, bottom[bottom_id]->channels()) - << "Inputs must have same channels."; + << "Inputs must have same channels."; CHECK_EQ(height_, bottom[bottom_id]->height()) - << "Inputs must have same height."; + << "Inputs must have same height."; CHECK_EQ(width_, bottom[bottom_id]->width()) - << "Inputs must have same width."; + << "Inputs must have same width."; } // Shape the tops. 
compute_output_shape(); @@ -195,15 +196,15 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, vector bias_multiplier_shape(1, height_out_ * width_out_); bias_multiplier_.Reshape(bias_multiplier_shape); caffe_set(bias_multiplier_.count(), Dtype(1), - bias_multiplier_.mutable_cpu_data()); + bias_multiplier_.mutable_cpu_data()); } //initializa OpenCL kernels and cl_mem objects ocl_setup(); } -template +template void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* weights, Dtype* output, bool skip_im2col) { const Dtype* col_buff = input; if (!is_1x1_) { if (!skip_im2col) { @@ -213,41 +214,42 @@ void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype) 0., output + output_offset_ * g); + group_, conv_out_spatial_dim_, kernel_dim_ / group_, + (Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g, + (Dtype) 0., output + output_offset_ * g); } } -template +template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, - const Dtype* bias) { - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), - (Dtype) 1., output); + const Dtype* bias) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, + height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), + (Dtype) 1., output); } -template +template void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { + const Dtype* weights, Dtype* input) { Dtype* col_buff = col_buffer_.mutable_cpu_data(); if (is_1x1_) { col_buff = input; } for (int g = 0; g < group_; ++g) { caffe_cpu_gemm < Dtype > 
(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype) 0., col_buff + col_offset_ * g); + conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g, + (Dtype) 0., col_buff + col_offset_ * g); } if (!is_1x1_) { conv_col2im_cpu(col_buff, input); } } -template +template void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { + const Dtype* output, Dtype* weights) { const Dtype* col_buff = input; if (!is_1x1_) { conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); @@ -255,26 +257,26 @@ void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype) 1., weights + weight_offset_ * g); + > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, + kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g, + (Dtype) 1., weights + weight_offset_ * g); } } -template +template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, - const Dtype* input) { + const Dtype* input) { caffe_cpu_gemv < Dtype - > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.cpu_data(), 1., bias); + > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., + input, bias_multiplier_.cpu_data(), 1., bias); } #ifndef CPU_ONLY -template +template void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* weights, Dtype* output, bool skip_im2col) { const Dtype* col_buff = input; if (!is_1x1_) { if (!skip_im2col) { @@ -285,17 +287,17 @@ void 
BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, - conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ - / group_, - (Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g, - (Dtype) 0., output, top_offset_ + output_offset_ * g); + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, + conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + / group_, + (Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g, + (Dtype) 0., output, top_offset_ + output_offset_ * g); } } -template +template void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, - const Dtype* weight, Dtype* output, bool skip_im2col) { + const Dtype* weight, Dtype* output, bool skip_im2col) { cl_command_queue Queue; const Dtype* col_buff = input; if (!is_1x1_) { @@ -305,15 +307,15 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, col_buff = col_buffer_.gpu_data(); } else { caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, - (Dtype*) transMem); + (Dtype*) transMem); } #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; else Queue = amdDevice.CommandQueue_helper; caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); } if(group_ == 2) { clFinish(amdDevice.CommandQueue); @@ -323,63 +325,63 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, Queue = amdDevice.CommandQueue; for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype - > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, 
col_offset_ - * g, - (Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g); + > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_ + * g, + (Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g); } #endif transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, - opt_num2); + opt_num2); } -template +template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, - const Dtype* bias) { + const Dtype* bias) { caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype) 1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype) 1., output, top_offset_); + height_out_ * width_out_, 1, (Dtype) 1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype) 1., output, top_offset_); } -template +template void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, - const Dtype* bias) { + const Dtype* bias) { for (int z = 0; z < opt_num2; z++) caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype) 1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype) 1., output, top_offset_ + num_output_ * N_ * z); + N_, 1, (Dtype) 1., bias, 0, + reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype) 1., output, top_offset_ + num_output_ * N_ * z); } -template +template void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { + const Dtype* weights, Dtype* input) { Dtype* col_buff = col_buffer_.mutable_gpu_data(); if (is_1x1_) { col_buff = input; } for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ - / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype) 1., weights, weight_offset_ * g, - output, top_offset_ + output_offset_ * g, - (Dtype) 0., col_buff, col_offset_ * g); + > (&(amdDevice.CommandQueue), CblasTrans, 
CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype) 1., weights, weight_offset_ * g, + output, top_offset_ + output_offset_ * g, + (Dtype) 0., col_buff, col_offset_ * g); } if (!is_1x1_) { conv_col2im_gpu(col_buff, input); } } -template +template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, - const Dtype* weights, Dtype* input) { + const Dtype* weights, Dtype* input) { cl_command_queue Queue; if (is_1x1_) { caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); + height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, + (Dtype*) transMem); } for (int g = 0; g < group_; ++g) { #ifdef multiQ @@ -389,10 +391,10 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, Queue = amdDevice.CommandQueue; #endif caffe_gpu_gemm < Dtype - > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, - (Dtype) 1., weights, weight_offset_ * g, - (Dtype*) subTopMem, top_offset_opt * g, - (Dtype) 0., (Dtype*) transMem, col_offset_ * g); + > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, + (Dtype) 1., weights, weight_offset_ * g, + (Dtype*) subTopMem, top_offset_opt * g, + (Dtype) 0., (Dtype*) transMem, col_offset_ * g); } #ifdef multiQ if(group_ ==2) { @@ -405,14 +407,14 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, conv_col2im_gpu_opt(input); } else { caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), - (Dtype*) transMem, input); + height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), + (Dtype*) transMem, input); } } -template +template void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { + const Dtype* output, Dtype* weights) { const Dtype* col_buff = input; if (!is_1x1_) { conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); @@ -420,25 +422,25 @@ void 
BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ - / group_, kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype) 1., output, top_offset_, - (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., - (Dtype*) weights, weight_offset_ * g); + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype) 1., output, top_offset_, + (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., + (Dtype*) weights, weight_offset_ * g); } } -template +template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, - const Dtype* output, Dtype* weights) { + const Dtype* output, Dtype* weights) { cl_command_queue Queue; if (!is_1x1_) { conv_im2col_gpu_opt(input); } else { caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); + (Dtype*) transMem); } opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, - opt_num2); + opt_num2); for (int g = 0; g < group_; ++g) { #ifdef multiQ @@ -448,10 +450,10 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, Queue = amdDevice.CommandQueue; #endif caffe_gpu_gemm < Dtype - > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, - (Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g, - (Dtype*) transMem, col_offset_ * g, (Dtype) 1., - (Dtype*) weights, weight_offset_ * g); + > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, + (Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g, + (Dtype*) transMem, col_offset_ * g, (Dtype) 1., + (Dtype*) weights, weight_offset_ * g); #ifdef multiQ if(group_ == 2) { clFinish(amdDevice.CommandQueue); @@ -461,14 +463,14 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, } } -template +template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, - const Dtype* input) { + const Dtype* 
input) { caffe_gpu_gemv < Dtype - > (CblasNoTrans, num_output_, N_, - (Dtype) 1., input, top_offset_, N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, - bias, (size_t) 0, 1); + > (CblasNoTrans, num_output_, N_, + (Dtype) 1., input, top_offset_, N_, + reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, + bias, (size_t) 0, 1); } #endif // !CPU_ONLY diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index f9a80979..b0c0ebf2 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -7,30 +7,31 @@ namespace caffe { -template +template BaseDataLayer::BaseDataLayer(const LayerParameter& param) - : Layer(param), - transform_param_(param.transform_param()) { +: + Layer(param), + transform_param_(param.transform_param()) { } -template +template void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (top.size() == 1) { output_labels_ = false; } else { output_labels_ = true; } data_transformer_.reset( - new DataTransformer(transform_param_, this->phase_)); + new DataTransformer(transform_param_, this->phase_)); data_transformer_->InitRand(); // The subclasses should setup the size of bottom and top DataLayerSetUp(bottom, top); } -template +template void BasePrefetchingDataLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); // Now, start the prefetch thread. 
Before calling prefetch, we make two // cpu_data calls so that the prefetch thread does not accidentally make @@ -45,20 +46,20 @@ void BasePrefetchingDataLayer::LayerSetUp( DLOG(INFO) << "Prefetch initialized."; } -template +template void BasePrefetchingDataLayer::CreatePrefetchThread() { this->data_transformer_->InitRand(); CHECK(StartInternalThread()) << "Thread execution failed"; } -template +template void BasePrefetchingDataLayer::JoinPrefetchThread() { CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; } -template +template void BasePrefetchingDataLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { // First, join the thread JoinPrefetchThread(); @@ -67,43 +68,44 @@ void BasePrefetchingDataLayer::Forward_cpu( top[0]->ReshapeLike(prefetch_data_); // Copy the data caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_cpu_data()); + top[0]->mutable_cpu_data()); DLOG(INFO) << "Prefetch copied"; if (this->output_labels_) { // Reshape to loaded labels. top[1]->ReshapeLike(prefetch_label_); // Copy the labels. 
caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_cpu_data()); + top[1]->mutable_cpu_data()); } // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); } -template +template void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, + const vector*>& top) { JoinPrefetchThread(); DLOG(INFO) << "Thread joined"; top[0]->ReshapeLike(this->prefetch_data_); OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, - sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, - NULL, NULL)); + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, + NULL, NULL)); DLOG(INFO) << "Prefetch copied"; if (this->output_labels_) { // Reshape to loaded labels. top[1]->ReshapeLike(prefetch_label_); OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, - sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), 0, - NULL, NULL)); + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), + 0, + NULL, NULL)); } #ifdef Track_data_transfer diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 8f72f41b..11b78a15 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -8,24 +8,24 @@ namespace caffe { const float kBNLL_THRESHOLD = 50.; -template +template void BNLLLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = - bottom_data[i] > 0 ? 
- bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); + bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. + exp(bottom_data[i])); } } -template +template void BNLLLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -39,9 +39,9 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } -template +template void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); @@ -49,10 +49,10 @@ void BNLLLayer::Forward_gpu(const vector*>& bottom, BNLLForward(count, bottom_data, top_data); } -template +template void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index b885d9e6..7d55ef40 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -6,17 +6,17 @@ namespace caffe { -template +template void ConcatLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const ConcatParameter& concat_param = this->layer_param_.concat_param(); CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) - << "Either axis or concat_dim should be specified; not both."; + << "Either axis or concat_dim should be specified; not both."; } -template +template void ConcatLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& 
top) { const int num_axes = bottom[0]->num_axes(); const ConcatParameter& concat_param = this->layer_param_.concat_param(); if (concat_param.has_concat_dim()) { @@ -24,8 +24,8 @@ void ConcatLayer::Reshape(const vector*>& bottom, // Don't allow negative indexing for concat_dim, a uint32 -- almost // certainly unintended. CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " - << "produced negative result; concat_dim must satisfy " - << "0 <= concat_dim < " << kMaxBlobAxes; + << "produced negative result; concat_dim must satisfy " + << "0 <= concat_dim < " << kMaxBlobAxes; CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; } else { concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); @@ -37,13 +37,13 @@ void ConcatLayer::Reshape(const vector*>& bottom, int bottom_count_sum = bottom[0]->count(); for (int i = 1; i < bottom.size(); ++i) { CHECK_EQ(num_axes, bottom[i]->num_axes()) - << "All inputs must have the same #axes."; + << "All inputs must have the same #axes."; for (int j = 0; j < num_axes; ++j) { if (j == concat_axis_) { continue; } CHECK_EQ(top_shape[j], bottom[i]->shape(j)) - << "All inputs must have the same shape, except at concat_axis."; + << "All inputs must have the same shape, except at concat_axis."; } bottom_count_sum += bottom[i]->count(); top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); @@ -52,9 +52,9 @@ void ConcatLayer::Reshape(const vector*>& bottom, CHECK_EQ(bottom_count_sum, top[0]->count()); } -template +template void ConcatLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype* top_data = top[0]->mutable_cpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -63,17 +63,17 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, const int bottom_concat_axis = bottom[i]->shape(concat_axis_); for (int n = 0; n < num_concats_; ++n) { caffe_copy(bottom_concat_axis * concat_input_size_, - bottom_data + n * 
bottom_concat_axis * concat_input_size_, - top_data + (n * top_concat_axis + offset_concat_axis) - * concat_input_size_); + bottom_data + n * bottom_concat_axis * concat_input_size_, + top_data + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } } -template +template void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -85,16 +85,16 @@ void ConcatLayer::Backward_cpu(const vector*>& top, const int bottom_concat_axis = bottom[i]->shape(concat_axis_); for (int n = 0; n < num_concats_; ++n) { caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + - (n * top_concat_axis + offset_concat_axis) * concat_input_size_, - bottom_diff + n * bottom_concat_axis * concat_input_size_); + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + bottom_diff + n * bottom_concat_axis * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } } -template +template void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (bottom.size() == 1) { return; } @@ -108,14 +108,14 @@ void ConcatLayer::Forward_gpu(const vector*>& bottom, const int bottom_concat_size = bottom_concat_axis * concat_input_size_; const int nthreads = bottom_concat_size * num_concats_; Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); offset_concat_axis += bottom_concat_axis; } } -template +template void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if 
(bottom.size() == 1) { return; } @@ -130,7 +130,7 @@ void ConcatLayer::Backward_gpu(const vector*>& top, const int bottom_concat_size = bottom_concat_axis * concat_input_size_; const int nthreads = bottom_concat_size * num_concats_; Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); } offset_concat_axis += bottom_concat_axis; } diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 9c3f38d5..6a91fdfd 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -8,9 +8,9 @@ namespace caffe { -template +template void ContrastiveLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); CHECK_EQ(bottom[0]->height(), 1); @@ -29,24 +29,24 @@ void ContrastiveLossLayer::LayerSetUp( summer_vec_.mutable_cpu_data()[i] = Dtype(1); } -template +template void ContrastiveLossLayer::Forward_cpu( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, + const vector*>& top) { int count = bottom[0]->count(); caffe_sub( - count, - bottom[0]->cpu_data(), // a - bottom[1]->cpu_data(), // b - diff_.mutable_cpu_data()); // a_i-b_i + count, + bottom[0]->cpu_data(), // a + bottom[1]->cpu_data(), // b + diff_.mutable_cpu_data()); // a_i-b_i const int channels = bottom[0]->channels(); Dtype margin = this->layer_param_.contrastive_loss_param().margin(); bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); + this->layer_param_.contrastive_loss_param().legacy_version(); Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, - diff_.cpu_data() + (i * 
channels), diff_.cpu_data() + (i * channels)); + diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs loss += dist_sq_.cpu_data()[i]; } else { // dissimilar pairs @@ -62,28 +62,28 @@ void ContrastiveLossLayer::Forward_cpu( top[0]->mutable_cpu_data()[0] = loss; } -template +template void ContrastiveLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { Dtype margin = this->layer_param_.contrastive_loss_param().margin(); bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); + this->layer_param_.contrastive_loss_param().legacy_version(); for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[i]->num()); + static_cast(bottom[i]->num()); int num = bottom[i]->num(); int channels = bottom[i]->channels(); for (int j = 0; j < num; ++j) { Dtype* bout = bottom[i]->mutable_cpu_diff(); if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs caffe_cpu_axpby( - channels, - alpha, - diff_.cpu_data() + (j * channels), - Dtype(0.0), - bout + (j * channels)); + channels, + alpha, + diff_.cpu_data() + (j * channels), + Dtype(0.0), + bout + (j * channels)); } else { // dissimilar pairs Dtype mdist(0.0); Dtype beta(0.0); @@ -97,11 +97,11 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } if (mdist > Dtype(0.0)) { caffe_cpu_axpby( - channels, - beta, - diff_.cpu_data() + (j * channels), - Dtype(0.0), - bout + (j * channels)); + channels, + beta, + diff_.cpu_data() + (j * channels), + Dtype(0.0), + bout + (j * channels)); } else { caffe_set(channels, Dtype(0), bout + (j * channels)); } @@ -111,32 +111,32 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } -template +template void ContrastiveLossLayer::Forward_gpu( - const 
vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { const int count = bottom[0]->count(); caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i + count, + bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + count, + diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), + diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + CblasNoTrans, + bottom[0]->num(), + bottom[0]->channels(), + Dtype(1.0), + diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), + Dtype(0.0), + dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 Dtype margin = this->layer_param_.contrastive_loss_param().margin(); bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); + this->layer_param_.contrastive_loss_param().legacy_version(); Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs @@ -154,25 +154,25 @@ void ContrastiveLossLayer::Forward_gpu( top[0]->mutable_cpu_data()[0] = loss; } -template +template void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const int count = bottom[0]->count(); const int channels = bottom[0]->channels(); Dtype margin = this->layer_param_.contrastive_loss_param().margin(); const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); + 
this->layer_param_.contrastive_loss_param().legacy_version(); const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); + static_cast(bottom[0]->num()); // NOLINT_NEXT_LINE(whitespace/operators) CLLBackward(count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); } } } diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index d5ffdb9f..bbe07f37 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -7,24 +7,24 @@ namespace caffe { -template +template void ConvolutionLayer::compute_output_shape() { this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) - / this->stride_h_ + 1; + / this->stride_h_ + 1; this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) - / this->stride_w_ + 1; + / this->stride_w_ + 1; } -template +template void ConvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* top_data = top[i]->mutable_cpu_data(); for (int n = 0; n < this->num_; ++n) { this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->cpu_data(); this->forward_cpu_bias(top_data + top[i]->offset(n), bias); @@ -35,9 +35,9 @@ void ConvolutionLayer::Forward_cpu(const vector*>& 
bottom, // CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template +template void ConvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -56,12 +56,12 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); + top_diff + top[i]->offset(n), weight_diff); } // gradient w.r.t. bottom data, if necessary. if (propagate_down[i]) { this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); + bottom_diff + bottom[i]->offset(n)); } } } @@ -69,28 +69,28 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } -template +template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (use_packing_scheme && global_packing_N > 1) Forward_gpu_opt2(bottom, top); else Forward_gpu_org(bottom, top); } -template +template void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N > 1) Backward_gpu_opt2(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); } -template +template void ConvolutionLayer::Forward_gpu_opt2( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, + const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -101,14 +101,14 @@ void ConvolutionLayer::Forward_gpu_opt2( this->weight_offset_ = 
this->M_ * this->K_; for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = - this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2; + this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2; //intermediate variables to pass offset this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; this->top_offset_ = top[i]->offset(n); this->col_offset_ = this->K_ * this->N_ * this->opt_num2; this->bottom_offset_ = bottom[i]->offset(n); this->forward_gpu_gemm_opt(bottom_data, weight, - top_data); + top_data); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); this->forward_gpu_bias_opt(top_data, bias); @@ -121,10 +121,10 @@ void ConvolutionLayer::Forward_gpu_opt2( } -template +template void ConvolutionLayer::Forward_gpu_org( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, + const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -136,7 +136,7 @@ void ConvolutionLayer::Forward_gpu_org( this->bottom_offset_ = bottom[i]->offset(n); this->top_offset_ = top[i]->offset(n); this->forward_gpu_gemm(bottom_data, weight, - top_data); + top_data); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); @@ -149,9 +149,9 @@ void ConvolutionLayer::Forward_gpu_org( //CHECK_BLOB_DATA(top[0],20, "top[0]"); } -template +template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -173,7 +173,9 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, this->opt_num2 = global_packing_N; for (int n = 0; n < this->num_; n += this->opt_num2) { this->opt_num2 = - this->opt_num2 > (this->num_ - 
n) ? (this->num_ - n) : this->opt_num2; + this->opt_num2 > (this->num_ - n) ? + (this->num_ - n) : + this->opt_num2; this->top_offset_ = top[i]->offset(n); this->bottom_offset_ = bottom[i]->offset(n); this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); @@ -181,21 +183,21 @@ void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm_opt(bottom_data, - top_diff, weight_diff); + top_diff, weight_diff); } // gradient w.r.t. bottom data, if necessary. if (propagate_down[i]) { this->backward_gpu_gemm_opt(top_diff, weight, - bottom_diff); + bottom_diff); } } } } } -template +template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -220,12 +222,12 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm(bottom_data, - top_diff, weight_diff); + top_diff, weight_diff); } // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { this->backward_gpu_gemm(top_diff, weight, - bottom_diff); + bottom_diff); } } } diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index bff8b10c..e9ee5221 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -16,14 +16,14 @@ namespace caffe { -template +template DataLayer::~DataLayer() { this->JoinPrefetchThread(); } -template +template void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Initialize DB db_.reset(db::GetDB(this->layer_param_.data_param().backend())); db_->Open(this->layer_param_.data_param().source(), db::READ); @@ -32,7 +32,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, // Check if we should randomly skip a few data points if (this->layer_param_.data_param().rand_skip()) { unsigned int skip = caffe_rng_rand() % - this->layer_param_.data_param().rand_skip(); + this->layer_param_.data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; while (skip-- > 0) { cursor_->Next(); @@ -51,8 +51,8 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, this->prefetch_data_.set_data_layer(); LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); // label if (this->output_labels_) { vector label_shape(1, this->layer_param_.data_param().batch_size()); @@ -63,7 +63,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, } // This function is used to create a thread that prefetches the data. 
-template +template void DataLayer::InternalThreadEntry() { CPUTimer batch_timer; batch_timer.Start(); diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index aa61a755..402a787e 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -8,24 +8,24 @@ namespace caffe { -template +template void DeconvolutionLayer::compute_output_shape() { this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ - - 2 * this->pad_h_; + - 2 * this->pad_h_; this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ - - 2 * this->pad_w_; + - 2 * this->pad_w_; } -template +template void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* top_data = top[i]->mutable_cpu_data(); for (int n = 0; n < this->num_; ++n) { this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->cpu_data(); this->forward_cpu_bias(top_data + top[i]->offset(n), bias); @@ -34,9 +34,9 @@ void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, } } -template +template void DeconvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -55,23 +55,23 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, // Gradient w.r.t. weight. Note that we will accumulate diffs. 
if (this->param_propagate_down_[0]) { this->weight_cpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); + bottom_data + bottom[i]->offset(n), weight_diff); } // Gradient w.r.t. bottom data, if necessary, reusing the column buffer // we might have just computed above. if (propagate_down[i]) { this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n), - this->param_propagate_down_[0]); + bottom_diff + bottom[i]->offset(n), + this->param_propagate_down_[0]); } } } } } -template +template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -88,9 +88,9 @@ void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, } } -template +template void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -113,12 +113,12 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); + bottom_data + bottom[i]->offset(n), weight_diff); } // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); + bottom_diff + bottom[i]->offset(n)); } } } diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index ae045c5c..c84c8622 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -10,19 +10,18 @@ namespace caffe { -template +template void DropoutLayer::ocl_setup(int bottom_count) { MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - bottom_count * sizeof(int), NULL, NULL); + bottom_count * sizeof(int), NULL, NULL); } -template +template DropoutLayer::~DropoutLayer() { OCL_CHECK (clReleaseMemObject(MaskMem) ); - } -template + }template void DropoutLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); threshold_ = this->layer_param_.dropout_param().dropout_ratio(); DCHECK(threshold_ > 0.); @@ -32,18 +31,18 @@ void DropoutLayer::LayerSetUp(const vector*>& bottom, ocl_setup(bottom[0]->count()); } -template +template void DropoutLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::Reshape(bottom, top); // Set up the cache for random number generation rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); } -template +template void DropoutLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); unsigned int* mask = rand_vec_.mutable_cpu_data(); @@ -59,10 +58,10 @@ void DropoutLayer::Forward_cpu(const vector*>& bottom, } } -template +template void DropoutLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& 
bottom) { if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -78,9 +77,9 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } -template +template void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); @@ -95,26 +94,26 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1., - threshold_); + threshold_); DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_, - top_data); + top_data); #endif } else { caffe_gpu_copy(count, bottom_data, top_data); } } -template +template void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); if (this->phase_ == TRAIN) { const int count = bottom[0]->count(); DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_, - (Dtype) scale_, bottom_diff); + (Dtype) scale_, bottom_diff); } else { caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); } diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 8a3fe17e..a5225ea6 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -6,39 +6,39 @@ namespace caffe { -template +template void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_top = top.size(); const DummyDataParameter& param = this->layer_param_.dummy_data_param(); const int num_data_filler = param.data_filler_size(); 
CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) - << "Number of data fillers must be 0, 1 or equal to the number of tops: " - << num_top << "; you specified " << num_data_filler << " data fillers."; + num_data_filler == num_top) + << "Number of data fillers must be 0, 1 or equal to the number of tops: " + << num_top << "; you specified " << num_data_filler << " data fillers."; const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); + param.height_size() || param.width_size(); if (legacy_dims) { CHECK_EQ(0, param.shape_size()) - << "Both shape and legacy fields were specified"; + << "Both shape and legacy fields were specified"; // Using deprecated 4D output dim specifiers. CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; + << "Must specify 'num' once, or once per top blob " + << "(" << num_top << "); specified " << param.num_size() << "."; CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; + << "Must specify 'channels' once, or once per top blob " + << "(" << num_top << "); specified " << param.channels_size() << "."; CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; + << "Must specify 'height' once, or once per top blob " + << "(" << num_top << "); specified " << param.height_size() << "."; CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; + << "Must specify 'width' once, or once per top blob " + << "(" << num_top << 
"); specified " << param.width_size() << "."; } else { CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; + << "Must specify 'shape' once, or once per top blob " + << "(" << num_top << "); specified " << param.shape_size() << "."; } // refill_[i] tells Forward i whether or not to actually refill top Blob i. // If refill_[i] is false, Forward does nothing for Blob i. We use this to @@ -71,18 +71,18 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, // Refill on each iteration iff not using a constant filler, // but use the inverse of this rule for the first run. refill_[i] = - (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); + (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); } } for (int i = 0; i < num_top; ++i) { if (legacy_dims) { const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); const int channels = - (param.channels_size() == 1) ? param.channels(0) : param.channels(i); + (param.channels_size() == 1) ? param.channels(0) : param.channels(i); const int height = - (param.height_size() == 1) ? param.height(0) : param.height(i); + (param.height_size() == 1) ? param.height(0) : param.height(i); const int width = - (param.width_size() == 1) ? param.width(0) : param.width(i); + (param.width_size() == 1) ? param.width(0) : param.width(i); top[i]->Reshape(num, channels, height, width); } else { const int shape_index = (param.shape_size() == 1) ? 0 : i; @@ -98,9 +98,9 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, } } -template +template void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { const int filler_id = (fillers_.size() > 1) ? 
i : 0; if (refill_[filler_id]) { diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 45126d44..e7b97b0d 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -7,16 +7,16 @@ namespace caffe { -template +template void EltwiseLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << - "Eltwise Layer takes one coefficient per bottom blob."; + || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << + "Eltwise Layer takes one coefficient per bottom blob."; CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << - "Eltwise layer only takes coefficients for summation."; + == EltwiseParameter_EltwiseOp_PROD + && this->layer_param().eltwise_param().coeff_size())) << + "Eltwise layer only takes coefficients for summation."; op_ = this->layer_param_.eltwise_param().operation(); // Blob-wise coefficients for the elementwise operation. coeffs_ = vector < Dtype > (bottom.size(), 1); @@ -28,23 +28,23 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad(); } -template +template void EltwiseLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { CHECK(bottom[i]->shape() == bottom[0]->shape()); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. 
if (this->layer_param_.eltwise_param().operation() == - EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { + EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->shape()); } } -template +template void EltwiseLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { int* mask = NULL; const Dtype* bottom_data_a = NULL; const Dtype* bottom_data_b = NULL; @@ -97,9 +97,9 @@ void EltwiseLayer::Forward_cpu( } } -template +template void EltwiseLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const int* mask = NULL; const int count = top[0]->count(); const Dtype* top_data = top[0]->cpu_data(); @@ -121,7 +121,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, initialized = true; } else { caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, - bottom_diff); + bottom_diff); } } } else { @@ -153,16 +153,16 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } -template +template void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int* mask = NULL; const int count = top[0]->count(); Dtype* top_data = top[0]->mutable_gpu_data(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); + top_data); for (int i = 2; i < bottom.size(); ++i) { caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); } @@ -178,11 +178,11 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, mask = max_idx_.mutable_gpu_data(); // NOLINT_NEXT_LINE(whitespace/operators) MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, - top_data, mask); + top_data, mask); for (int i = 2; i < bottom.size(); ++i) { // NOLINT_NEXT_LINE(whitespace/operators) MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, - mask); + mask); } break; default: @@ 
-190,9 +190,9 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, } } -template +template void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const int* mask = NULL; const int count = top[0]->count(); const Dtype* top_data = top[0]->gpu_data(); @@ -214,7 +214,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, initialized = true; } else { caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); + bottom_diff); } } } else { diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index d5abc23f..56dc48ec 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -7,74 +7,74 @@ namespace caffe { -template +template void EuclideanLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) - << "Inputs must have the same dimension."; + << "Inputs must have the same dimension."; diff_.ReshapeLike(*bottom[0]); } -template +template void EuclideanLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int count = bottom[0]->count(); caffe_sub( - count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); + count, + bottom[0]->cpu_data(), + bottom[1]->cpu_data(), + diff_.mutable_cpu_data()); Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); Dtype loss = dot / bottom[0]->num() / Dtype(2); top[0]->mutable_cpu_data()[0] = loss; } -template +template void EuclideanLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 
1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); caffe_cpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.cpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_cpu_diff()); // b + bottom[i]->count(), // count + alpha, // alpha + diff_.cpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // b } } } -template +template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int count = bottom[0]->count(); caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); + count, + bottom[0]->gpu_data(), + bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); Dtype dot; caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); Dtype loss = dot / bottom[0]->num() / Dtype(2); top[0]->mutable_cpu_data()[0] = loss; } -template +template void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 
1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b + bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b } } } diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 8451b133..bf783786 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -7,9 +7,9 @@ namespace caffe { -template +template void ExpLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.exp_param().base(); if (base != Dtype(-1)) { @@ -19,18 +19,18 @@ void ExpLayer::LayerSetUp(const vector*>& bottom, // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; + << "NaN result: log(base) = log(" << base << ") = " << log_base; CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + << "Inf result: log(base) = log(" << base << ") = " << log_base; const Dtype input_scale = this->layer_param_.exp_param().scale(); const Dtype input_shift = this->layer_param_.exp_param().shift(); inner_scale_ = log_base * input_scale; outer_scale_ = (input_shift == Dtype(0)) ? 
Dtype(1) : pow(base, input_shift); } -template +template void ExpLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); @@ -45,9 +45,9 @@ void ExpLayer::Forward_cpu(const vector*>& bottom, } } -template +template void ExpLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -61,9 +61,9 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } -template +template void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); @@ -78,9 +78,9 @@ void ExpLayer::Forward_gpu(const vector*>& bottom, } } -template +template void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index 9fa26c80..f7096a09 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -7,26 +7,26 @@ namespace caffe { -template +template void FilterLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(top.size(), bottom.size() - 1); first_reshape_ = true; } -template +template void FilterLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // bottom[0...k-1] are the blobs to filter // bottom[last] is the "selector_blob" int selector_index = bottom.size() - 1; for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) { CHECK_EQ(bottom[selector_index]->shape(i), 1) - << "Selector blob 
dimensions must be singletons (1), except the first"; + << "Selector blob dimensions must be singletons (1), except the first"; } for (int i = 0; i < bottom.size() - 1; ++i) { CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) << - "Each bottom should have the same 0th dimension as the selector blob"; + "Each bottom should have the same 0th dimension as the selector blob"; } const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); @@ -59,9 +59,9 @@ void FilterLayer::Reshape(const vector*>& bottom, } } -template +template void FilterLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int new_tops_num = indices_to_forward_.size(); // forward all filtered items for all bottoms but the Selector (bottom[last]) for (int t = 0; t < top.size(); ++t) { @@ -72,17 +72,17 @@ void FilterLayer::Forward_cpu(const vector*>& bottom, int data_offset_top = n * dim; int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1); caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); + top_data + data_offset_top); } } } -template +template void FilterLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[bottom.size() - 1]) { LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; + << "Layer cannot backpropagate to filter index inputs"; } for (int i = 0; i < top.size(); i++) { // bottom[last] is the selector and never needs backpropagation @@ -99,17 +99,17 @@ void FilterLayer::Backward_cpu(const vector*>& top, // we already visited all items that were been forwarded, so // just set to zero remaining ones caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); + bottom[i]->mutable_cpu_diff() + data_offset_bottom); } else { batch_offset = indices_to_forward_[next_to_backward_offset]; if (n != batch_offset) { // this 
data was not been forwarded caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); + bottom[i]->mutable_cpu_diff() + data_offset_bottom); } else { // this data was been forwarded data_offset_top = next_to_backward_offset * dim; next_to_backward_offset++; // point to next forwarded item index caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top, - bottom[i]->mutable_cpu_diff() + data_offset_bottom); + bottom[i]->mutable_cpu_diff() + data_offset_bottom); } } } @@ -117,9 +117,9 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } -template +template void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int new_tops_num = indices_to_forward_.size(); // forward all filtered items for all bottoms but the Selector (bottom[last]) for (int t = 0; t < top.size(); ++t) { @@ -130,17 +130,17 @@ void FilterLayer::Forward_gpu(const vector*>& bottom, int data_offset_top = n * dim; int data_offset_bottom = indices_to_forward_[n] * dim; caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); + top_data + data_offset_top); } } } -template +template void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[bottom.size() - 1]) { LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; + << "Layer cannot backpropagate to filter index inputs"; } for (int i = 0; i < top.size(); ++i) { // bottom[last] is the selector and never needs backpropagation @@ -157,18 +157,18 @@ void FilterLayer::Backward_gpu(const vector*>& top, // just set to zero remaining ones data_offset_bottom = n * dim; caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); + bottom[i]->mutable_gpu_diff() + data_offset_bottom); } else { batch_offset = indices_to_forward_[next_to_backward_offset]; data_offset_bottom = n * dim; if 
(n != batch_offset) { // this data was not been forwarded caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); + bottom[i]->mutable_gpu_diff() + data_offset_bottom); } else { // this data was been forwarded data_offset_top = next_to_backward_offset * dim; ++next_to_backward_offset; // point to next forwarded item index caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); + bottom[i]->mutable_gpu_diff() + data_offset_bottom); } } } diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 4aaad3a4..e79e9406 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -6,13 +6,13 @@ namespace caffe { -template +template void FlattenLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int start_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().axis()); + this->layer_param_.flatten_param().axis()); const int end_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().end_axis()); + this->layer_param_.flatten_param().end_axis()); vector top_shape; for (int i = 0; i < start_axis; ++i) { top_shape.push_back(bottom[0]->shape(i)); @@ -26,15 +26,15 @@ void FlattenLayer::Reshape(const vector*>& bottom, CHECK_EQ(top[0]->count(), bottom[0]->count()); } -template +template void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ShareData(*bottom[0]); } -template +template void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { bottom[0]->ShareDiff(*top[0]); } diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 377755b9..6f67dc06 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -20,12 
+20,12 @@ namespace caffe { -template +template HDF5DataLayer::~HDF5DataLayer() { } // Load data and label from HDF5 filename into the class property blobs. -template +template void HDF5DataLayer::LoadHDF5FileData(const char* filename) { DLOG(INFO) << "Loading HDF5 file: " << filename; hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT); @@ -42,7 +42,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { for (int i = 0; i < top_size; ++i) { hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), - MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); + MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); } herr_t status = H5Fclose(file_id); @@ -64,18 +64,18 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) - << " rows (shuffled)"; + << " rows (shuffled)"; } else { DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } } -template +template void HDF5DataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Refuse transformation parameters since HDF5 is totally generic. CHECK(!this->layer_param_.has_transform_param()) << - this->type() << " does not transform data."; + this->type() << " does not transform data."; // Read the source to parse the filenames. 
const string& source = this->layer_param_.hdf5_data_param().source(); LOG(INFO) << "Loading list of HDF5 filenames from: " << source; @@ -94,7 +94,7 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, current_file_ = 0; LOG(INFO) << "Number of HDF5 files: " << num_files_; CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " - << source; + << source; file_permutation_.clear(); file_permutation_.resize(num_files_); @@ -126,9 +126,9 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, } } -template +template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); for (int i = 0; i < batch_size; ++i, ++current_row_) { if (current_row_ == hdf_blobs_[0]->shape(0)) { @@ -138,12 +138,12 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, current_file_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); } DLOG(INFO) << "Looping around to first file."; } LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); + hdf_filenames_[file_permutation_[current_file_]].c_str()); } current_row_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) @@ -152,15 +152,15 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } } } -template +template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); 
for (int i = 0; i < batch_size; ++i, ++current_row_) { if (current_row_ == hdf_blobs_[0]->shape(0)) { @@ -170,12 +170,12 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, current_file_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); } DLOG(INFO) << "Looping around to first file."; } LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); + hdf_filenames_[file_permutation_[current_file_]].c_str()); } current_row_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) @@ -184,11 +184,12 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, - i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], - 0, NULL, NULL)); + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, + i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], + 0, NULL, NULL)); //caffe_copy(data_dim, // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index cbb8a6fe..baad0dea 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -11,17 +11,17 @@ namespace caffe { -template +template void HDF5OutputLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { file_name_ = this->layer_param_.hdf5_output_param().file_name(); file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - H5P_DEFAULT); + 
H5P_DEFAULT); CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; file_opened_ = true; } -template +template HDF5OutputLayer::~HDF5OutputLayer() { if (file_opened_) { herr_t status = H5Fclose(file_id_); @@ -29,74 +29,76 @@ HDF5OutputLayer::~HDF5OutputLayer() { } } -template +template void HDF5OutputLayer::SaveBlobs() { // TODO: no limit on the number of blobs LOG(INFO) << "Saving HDF5 file " << file_name_; CHECK_EQ(data_blob_.num(), label_blob_.num()) << - "data blob and label blob must have the same batch size"; + "data blob and label blob must have the same batch size"; hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; } -template +template void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); + bottom[1]->height(), bottom[1]->width()); const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + &data_blob_.mutable_cpu_data()[i * data_datum_dim]); caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + &label_blob_.mutable_cpu_data()[i * label_datum_dim]); } SaveBlobs(); } -template +template void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + 
const vector& propagate_down, const vector*>& bottom) { return; } -template +template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); + bottom[1]->height(), bottom[1]->width()); const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { OCL_CHECK( - clEnqueueReadBuffer(amdDevice.CommandQueue, - (cl_mem) bottom[0]->gpu_data(), CL_TRUE, - i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, - &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[0]->gpu_data(), CL_TRUE, + i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, + &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); OCL_CHECK( - clEnqueueReadBuffer(amdDevice.CommandQueue, - (cl_mem) bottom[1]->gpu_data(), CL_TRUE, - i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, - &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL)); + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[1]->gpu_data(), CL_TRUE, + i * label_datum_dim * sizeof(Dtype), + sizeof(Dtype) * label_datum_dim, + &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, + NULL)); } SaveBlobs(); } -template +template void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { return; } diff --git a/src/caffe/layers/hinge_loss_layer.cpp 
b/src/caffe/layers/hinge_loss_layer.cpp index e01e1d6a..d415bd64 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -10,9 +10,9 @@ namespace caffe { -template +template void HingeLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const Dtype* label = bottom[1]->cpu_data(); @@ -27,7 +27,7 @@ void HingeLossLayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < num; ++i) { for (int j = 0; j < dim; ++j) { bottom_diff[i * dim + j] = std::max( - Dtype(0), 1 + bottom_diff[i * dim + j]); + Dtype(0), 1 + bottom_diff[i * dim + j]); } } Dtype* loss = top[0]->mutable_cpu_data(); @@ -43,12 +43,12 @@ void HingeLossLayer::Forward_cpu(const vector*>& bottom, } } -template +template void HingeLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index b29e47e2..a8ddc7fe 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -7,24 +7,24 @@ namespace caffe { -template +template void Im2colLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { ConvolutionParameter conv_param = this->layer_param_.convolution_param(); CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and 
kernel_w; not both"; CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); } else { @@ -47,65 +47,65 @@ void Im2colLayer::LayerSetUp(const vector*>& bottom, } } -template +template void Im2colLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; + << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); top[0]->Reshape( - bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, - (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, - (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); + bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, + (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } -template +template void 
Im2colLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); for (int n = 0; n < bottom[0]->num(); ++n) { im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data + top[0]->offset(n)); } } -template +template void Im2colLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); for (int n = 0; n < top[0]->num(); ++n) { col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); } } -template +template void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); for (int n = 0; n < bottom[0]->num(); ++n) { im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data, top[0]->offset(n)); + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data, top[0]->offset(n)); } } -template +template void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); for (int n = 0; n < top[0]->num(); ++n) { 
col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); } } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 846bcc34..24ac8ffc 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -15,22 +15,22 @@ namespace caffe { -template +template ImageDataLayer::~ImageDataLayer() { this->JoinPrefetchThread(); } -template +template void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int new_height = this->layer_param_.image_data_param().new_height(); const int new_width = this->layer_param_.image_data_param().new_width(); const bool is_color = this->layer_param_.image_data_param().is_color(); string root_folder = this->layer_param_.image_data_param().root_folder(); CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; + (new_height > 0 && new_width > 0)) << "Current implementation requires " + "new_height and new_width to be set at the same time."; // Read the file with filenames and labels const string& source = this->layer_param_.image_data_param().source(); LOG(INFO) << "Opening file " << source; @@ -54,14 +54,14 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, // Check if we would need to randomly skip a few data points if (this->layer_param_.image_data_param().rand_skip()) { unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); + this->layer_param_.image_data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; lines_id_ = skip; } // Read an image, 
and use it to initialize the top blob. cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); + new_height, new_width, is_color); // Use data_transformer to infer the expected blob shape from a cv_image. vector top_shape = this->data_transformer_->InferBlobShape(cv_img); this->transformed_data_.Reshape(top_shape); @@ -72,23 +72,23 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, top[0]->ReshapeLike(this->prefetch_data_); LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); this->prefetch_label_.Reshape(label_shape); } -template +template void ImageDataLayer::ShuffleImages() { caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); + static_cast(prefetch_rng_->generator()); shuffle(lines_.begin(), lines_.end(), prefetch_rng); } // This function is used to create a thread that prefetches the data. -template +template void ImageDataLayer::InternalThreadEntry() { CPUTimer batch_timer; batch_timer.Start(); @@ -107,7 +107,7 @@ void ImageDataLayer::InternalThreadEntry() { // Reshape according to the first image of each batch // on single input batches allows for inputs of varying dimension. cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); + new_height, new_width, is_color); // Use data_transformer to infer the expected blob shape from a cv_img. 
vector top_shape = this->data_transformer_->InferBlobShape(cv_img); this->transformed_data_.Reshape(top_shape); @@ -125,7 +125,7 @@ void ImageDataLayer::InternalThreadEntry() { timer.Start(); CHECK_GT(lines_size, lines_id_); cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); + new_height, new_width, is_color); CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; read_time += timer.MicroSeconds(); timer.Start(); diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index e5294a7e..21414224 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -10,23 +10,23 @@ namespace caffe { -template +template void InfogainLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::LayerSetUp(bottom, top); if (bottom.size() < 3) { CHECK(this->layer_param_.infogain_loss_param().has_source()) - << "Infogain matrix source must be specified."; + << "Infogain matrix source must be specified."; BlobProto blob_proto; ReadProtoFromBinaryFile( - this->layer_param_.infogain_loss_param().source(), &blob_proto); + this->layer_param_.infogain_loss_param().source(), &blob_proto); infogain_.FromProto(blob_proto); } } -template +template void InfogainLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::Reshape(bottom, top); Blob < Dtype > *infogain = NULL; if (bottom.size() < 3) { @@ -45,9 +45,9 @@ void InfogainLossLayer::Reshape( CHECK_EQ(infogain->width(), dim); } -template +template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); const Dtype* infogain_mat = NULL; @@ -69,17 +69,17 @@ void 
InfogainLossLayer::Forward_cpu(const vector*>& bottom, top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void InfogainLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down.size() > 2 && propagate_down[2]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to infogain inputs."; + << " Layer cannot backpropagate to infogain inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index e563aa21..3beca42f 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -9,14 +9,14 @@ namespace caffe { -template +template void InnerProductLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_output = this->layer_param_.inner_product_param().num_output(); bias_term_ = this->layer_param_.inner_product_param().bias_term(); N_ = num_output; const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); + this->layer_param_.inner_product_param().axis()); // Dimensions starting from "axis" are "flattened" into a single // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), // and axis == 1, N inner products with dimension CHW are performed. 
@@ -37,29 +37,29 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, this->blobs_[0].reset(new Blob(weight_shape)); // fill the weights shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( - this->layer_param_.inner_product_param().weight_filler())); + this->layer_param_.inner_product_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, intiialize and fill the bias term if (bias_term_) { vector bias_shape(1, N_); this->blobs_[1].reset(new Blob(bias_shape)); shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( - this->layer_param_.inner_product_param().bias_filler())); + this->layer_param_.inner_product_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } // parameter initialization this->param_propagate_down_.resize(this->blobs_.size(), true); } -template +template void InnerProductLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Figure out the dimensions const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); + this->layer_param_.inner_product_param().axis()); const int new_K = bottom[0]->count(axis); CHECK_EQ(K_, new_K) - << "Input size incompatible with inner product parameters."; + << "Input size incompatible with inner product parameters."; // The first "axis" dimensions are independent inner products; the total // number of these is M_, the product over these dimensions. 
M_ = bottom[0]->count(0, axis); @@ -77,92 +77,92 @@ void InnerProductLayer::Reshape(const vector*>& bottom, } } -template +template void InnerProductLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const Dtype* weight = this->blobs_[0]->cpu_data(); caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., - bottom_data, weight, (Dtype) 0., top_data); + bottom_data, weight, (Dtype) 0., top_data); if (bias_term_) { caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., - bias_multiplier_.cpu_data(), - this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); + bias_multiplier_.cpu_data(), + this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); } } -template +template void InnerProductLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (this->param_propagate_down_[0]) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., - top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); + top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bias caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff, - bias_multiplier_.cpu_data(), (Dtype) 1., - this->blobs_[1]->mutable_cpu_diff()); + bias_multiplier_.cpu_data(), (Dtype) 1., + this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bottom data caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., - top_diff, 
this->blobs_[0]->cpu_data(), (Dtype) 0., - bottom[0]->mutable_cpu_diff()); + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., + top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., + bottom[0]->mutable_cpu_diff()); } } -template +template void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const Dtype* weight = this->blobs_[0]->gpu_data(); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., - bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); + bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); if (bias_term_) { caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., - bias_multiplier_.gpu_data(), 0, - this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); + bias_multiplier_.gpu_data(), 0, + this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); } } -template +template void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (this->param_propagate_down_[0]) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight caffe_gpu_gemm < Dtype - > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., - top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., + top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bias caffe_gpu_gemv < Dtype - > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, - (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), - (size_t) 0, (Dtype) 0., 1, - this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); + > 
(CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, + (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), + (size_t) 0, (Dtype) 0., 1, + this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); } if (propagate_down[0]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bottom data caffe_gpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., - top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., - bottom[0]->mutable_gpu_diff(), 0); + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., + top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., + bottom[0]->mutable_gpu_diff(), 0); } } diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index e388dfef..60b08d99 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -7,9 +7,9 @@ namespace caffe { -template +template void LogLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.log_param().base(); if (base != Dtype(-1)) { @@ -19,22 +19,22 @@ void LogLayer::LayerSetUp(const vector*>& bottom, // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? 
Dtype(1) : log(base); CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; + << "NaN result: log(base) = log(" << base << ") = " << log_base; CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + << "Inf result: log(base) = log(" << base << ") = " << log_base; base_scale_ = Dtype(1) / log_base; CHECK(!isnan(base_scale_)) - << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; CHECK(!isinf(base_scale_)) - << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; input_scale_ = this->layer_param_.log_param().scale(); input_shift_ = this->layer_param_.log_param().shift(); backward_num_scale_ = input_scale_ / log_base; } -template +template void LogLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); @@ -55,9 +55,9 @@ void LogLayer::Forward_cpu(const vector*>& bottom, } } -template +template void LogLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -79,9 +79,9 @@ void LogLayer::Backward_cpu(const vector*>& top, caffe_mul(count, top_diff, bottom_diff, bottom_diff); } -template +template void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); @@ -102,9 +102,9 @@ void LogLayer::Forward_gpu(const vector*>& bottom, } } -template +template void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const 
vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 503014f5..f5da913a 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -10,20 +10,20 @@ namespace caffe { -template +template void LossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { // LossLayers have a non-zero (1) loss by default. if (this->layer_param_.loss_weight_size() == 0) { this->layer_param_.add_loss_weight(Dtype(1)); } } -template +template void LossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; + << "The data and label should have the same number."; vector loss_shape(0); // Loss layers output a scalar; 0 axes. top[0]->Reshape(loss_shape); } diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 58f835b6..2dfcd645 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -8,9 +8,9 @@ namespace caffe { -template +template void LRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { size_ = this->layer_param_.lrn_param().local_size(); CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; pre_pad_ = (size_ - 1) / 2; @@ -18,7 +18,7 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, beta_ = this->layer_param_.lrn_param().beta(); k_ = this->layer_param_.lrn_param().k(); if (this->layer_param_.lrn_param().norm_region() == - LRNParameter_NormRegion_WITHIN_CHANNEL) { + LRNParameter_NormRegion_WITHIN_CHANNEL) { // Set up split_layer_ to use inputs in the numerator and denominator. 
split_top_vec_.clear(); split_top_vec_.push_back(&product_input_); @@ -40,7 +40,7 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, pool_top_vec_.push_back(&pool_output_); LayerParameter pool_param; pool_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); + PoolingParameter_PoolMethod_AVE); pool_param.mutable_pooling_param()->set_pad(pre_pad_); pool_param.mutable_pooling_param()->set_kernel_size(size_); pool_layer_.reset(new PoolingLayer(pool_param)); @@ -68,33 +68,33 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, } } -template +template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - top[0]->Reshape(num_, channels_, height_, width_); - scale_.Reshape(num_, channels_, height_, width_); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - split_layer_->Reshape(bottom, split_top_vec_); - square_layer_->Reshape(square_bottom_vec_, square_top_vec_); - pool_layer_->Reshape(square_top_vec_, pool_top_vec_); - power_layer_->Reshape(pool_top_vec_, power_top_vec_); - product_layer_->Reshape(product_bottom_vec_, top); - break; - } + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + top[0]->Reshape(num_, channels_, height_, width_); + scale_.Reshape(num_, channels_, height_, width_); + break; + case 
LRNParameter_NormRegion_WITHIN_CHANNEL: + split_layer_->Reshape(bottom, split_top_vec_); + square_layer_->Reshape(square_bottom_vec_, square_top_vec_); + pool_layer_->Reshape(square_top_vec_, pool_top_vec_); + power_layer_->Reshape(pool_top_vec_, power_top_vec_); + product_layer_->Reshape(product_bottom_vec_, top); + break; + } } -template +template void LRNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: CrossChannelForward_cpu(bottom, top); @@ -107,9 +107,9 @@ void LRNLayer::Forward_cpu(const vector*>& bottom, } } -template +template void LRNLayer::CrossChannelForward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); Dtype* scale_data = scale_.mutable_cpu_data(); @@ -125,27 +125,27 @@ void LRNLayer::CrossChannelForward_cpu( for (int n = 0; n < num_; ++n) { // compute the padded square caffe_sqr(channels_ * height_ * width_, - bottom_data + bottom[0]->offset(n), - padded_square_data + padded_square.offset(0, pre_pad_)); + bottom_data + bottom[0]->offset(n), + padded_square_data + padded_square.offset(0, pre_pad_)); // Create the first channel scale for (int c = 0; c < size_; ++c) { caffe_axpy < Dtype > (height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c), - scale_data + scale_.offset(n, 0)); + padded_square_data + padded_square.offset(0, c), + scale_data + scale_.offset(n, 0)); } for (int c = 1; c < channels_; ++c) { // copy previous scale caffe_copy < Dtype > (height_ * width_, - scale_data + scale_.offset(n, c - 1), - scale_data + scale_.offset(n, c)); + scale_data + scale_.offset(n, c - 1), + scale_data + scale_.offset(n, c)); // add head caffe_axpy < Dtype > (height_ * width_, alpha_over_size, - padded_square_data + 
padded_square.offset(0, c + size_ - 1), - scale_data + scale_.offset(n, c)); + padded_square_data + padded_square.offset(0, c + size_ - 1), + scale_data + scale_.offset(n, c)); // subtract tail caffe_axpy < Dtype > (height_ * width_, -alpha_over_size, - padded_square_data + padded_square.offset(0, c - 1), - scale_data + scale_.offset(n, c)); + padded_square_data + padded_square.offset(0, c - 1), + scale_data + scale_.offset(n, c)); } } @@ -154,9 +154,9 @@ void LRNLayer::CrossChannelForward_cpu( caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); } -template +template void LRNLayer::WithinChannelForward( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); square_layer_->Forward(square_bottom_vec_, square_top_vec_); pool_layer_->Forward(square_top_vec_, pool_top_vec_); @@ -164,9 +164,9 @@ void LRNLayer::WithinChannelForward( product_layer_->Forward(product_bottom_vec_, top); } -template +template void LRNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: CrossChannelBackward_cpu(top, propagate_down, bottom); @@ -179,10 +179,10 @@ void LRNLayer::Backward_cpu(const vector*>& top, } } -template +template void LRNLayer::CrossChannelBackward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -206,82 +206,82 @@ void LRNLayer::CrossChannelBackward_cpu( int block_offset = scale_.offset(n); // first, compute diff_i * y_i / s_i caffe_mul < Dtype > (channels_ * height_ * width_, - top_diff + block_offset, 
top_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + top_diff + block_offset, top_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); caffe_div < Dtype > (channels_ * height_ * width_, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), - scale_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), + scale_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); // Now, compute the accumulated ratios and the bottom diff caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); for (int c = 0; c < size_ - 1; ++c) { caffe_axpy < Dtype > (height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); } for (int c = 0; c < channels_; ++c) { caffe_axpy < Dtype > (height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), - accum_ratio_data); + padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), + accum_ratio_data); // compute bottom diff caffe_mul < Dtype > (height_ * width_, - bottom_data + top[0]->offset(n, c), - accum_ratio_data, accum_ratio_times_bottom); + bottom_data + top[0]->offset(n, c), + accum_ratio_data, accum_ratio_times_bottom); caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value, - accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); + accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); caffe_axpy < Dtype > (height_ * width_, -1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); } } } -template +template void LRNLayer::WithinChannelBackward( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { if 
(propagate_down[0]) { vector product_propagate_down(2, true); product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); square_layer_->Backward(square_top_vec_, propagate_down, - square_bottom_vec_); + square_bottom_vec_); split_layer_->Backward(split_top_vec_, propagate_down, bottom); } } -template +template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. 
+ int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); } -template +template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); } -template +template void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: CrossChannelForward_gpu(bottom, top); @@ -294,9 +294,9 @@ void LRNLayer::Forward_gpu(const vector*>& bottom, } } -template +template void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: CrossChannelBackward_gpu(top, propagate_down, bottom); diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 2cd04f93..e3b12908 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -8,17 +8,17 @@ namespace caffe { -template +template void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { batch_size_ = this->layer_param_.memory_data_param().batch_size(); channels_ = this->layer_param_.memory_data_param().channels(); height_ = this->layer_param_.memory_data_param().height(); width_ = this->layer_param_.memory_data_param().width(); size_ = channels_ * height_ * width_; CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; + "batch_size, channels, height, and width must be specified and" + " positive in memory_data_param"; vector label_shape(1, batch_size_); top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(label_shape); @@ -30,14 +30,14 @@ void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, added_label_.cpu_data(); } -template +template void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { CHECK(!has_new_data_) << - "Can't 
add data until current data has been consumed."; + "Can't add data until current data has been consumed."; size_t num = datum_vector.size(); CHECK_GT(num, 0) << "There is no datum to add."; CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) @@ -53,15 +53,15 @@ void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { has_new_data_ = true; } -template +template void MemoryDataLayer::AddMatVector(const vector& mat_vector, - const vector& labels) { + const vector& labels) { size_t num = mat_vector.size(); CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; + "Can't add mat until current data has been consumed."; CHECK_GT(num, 0) << "There is no mat to add"; CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) 
@@ -77,7 +77,7 @@ void MemoryDataLayer::AddMatVector(const vector& mat_vector, has_new_data_ = true; } -template +template void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { CHECK(data); CHECK(labels); @@ -93,18 +93,18 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { pos_ = 0; } -template +template void MemoryDataLayer::set_batch_size(int new_size) { CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; + "Can't change batch_size until current data has been consumed."; batch_size_ = new_size; added_data_.Reshape(batch_size_, channels_, height_, width_); added_label_.Reshape(batch_size_, 1, 1, 1); } -template +template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(batch_size_, 1, 1, 1); diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 5e57cf85..358ed891 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -10,18 +10,18 @@ namespace caffe { -template +template void MultinomialLogisticLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[1]->channels(), 1); CHECK_EQ(bottom[1]->height(), 1); CHECK_EQ(bottom[1]->width(), 1); } -template +template void MultinomialLogisticLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); int num = bottom[0]->num(); @@ -30,19 +30,19 @@ void MultinomialLogisticLossLayer::Forward_cpu( for (int i = 0; i < num; ++i) { int 
label = static_cast(bottom_label[i]); Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); loss -= log(prob); } top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void MultinomialLogisticLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -55,7 +55,7 @@ void MultinomialLogisticLossLayer::Backward_cpu( for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); bottom_diff[i * dim + label] = scale / prob; } } diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 0bd4e989..0a6613d7 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -7,27 +7,27 @@ namespace caffe { -template +template void MVNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); + 1, 1); variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); + 1, 1); temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); sum_multiplier_.Reshape(1, 1, - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); 
caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); eps_ = this->layer_param_.mvn_param().eps(); } -template +template void MVNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); int num; @@ -41,56 +41,56 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, if (this->layer_param_.mvn_param().normalize_variance()) { // put the squares of bottom into temp_ caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); + temp_.mutable_cpu_data()); // computes variance using var(X) = E(X^2) - (EX)^2 caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX caffe_cpu_gemv < Dtype - > (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(), - sum_multiplier_.cpu_data(), 0., - variance_.mutable_cpu_data()); // E(X^2) + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), + sum_multiplier_.cpu_data(), 0., + variance_.mutable_cpu_data()); // E(X^2) caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), - temp_.mutable_cpu_data()); // (EX)^2 + temp_.mutable_cpu_data()); // (EX)^2 caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), - variance_.mutable_cpu_data()); // variance + variance_.mutable_cpu_data()); // variance // do mean and variance normalization // subtract mean caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); // normalize variance caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), - variance_.mutable_cpu_data()); + variance_.mutable_cpu_data()); caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); } else { caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX // subtract mean caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); } } -template +template void MVNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -107,27 +107,27 @@ void MVNLayer::Backward_cpu(const vector*>& top, if (this->layer_param_.mvn_param().normalize_variance()) { caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - bottom_diff); + mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., + bottom_diff); caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., - bottom_diff); + mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., + bottom_diff); caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); + bottom_diff); // put the squares of bottom into temp_ caffe_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); + temp_.mutable_cpu_data()); caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., + temp_.mutable_cpu_data()); caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); } else { @@ -135,9 +135,9 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } -template +template void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); int num; @@ -151,55 +151,55 @@ void MVNLayer::Forward_gpu(const vector*>& bottom, if (this->layer_param_.mvn_param().normalize_variance()) { // put the squares of bottom into temp_ caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); + temp_.mutable_gpu_data()); // computes variance using var(X) = E(X^2) - (EX)^2 caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX caffe_gpu_gemv < Dtype - > (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), + sum_multiplier_.gpu_data(), 0., + variance_.mutable_gpu_data()); // E(X^2) caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 + temp_.mutable_gpu_data()); // (EX)^2 caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance + variance_.mutable_gpu_data()); // variance // do mean and variance normalization // subtract mean caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); // normalize variance caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); + variance_.mutable_gpu_data()); caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); } else { caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX // subtract mean caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); } } -template +template void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* top_data = top[0]->gpu_data(); const Dtype* bottom_data = bottom[0]->gpu_data(); @@ -216,36 +216,36 @@ void MVNLayer::Backward_gpu(const vector*>& top, if (this->layer_param_.mvn_param().normalize_variance()) { caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + bottom_diff); caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); + mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., + bottom_diff); caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); + bottom_diff); // put the squares of bottom into temp_ caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); + temp_.mutable_gpu_data()); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); + variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); } else { caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); } } diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp index 2a0a2088..a9edeffd 100644 --- a/src/caffe/layers/neuron_layer.cpp +++ b/src/caffe/layers/neuron_layer.cpp @@ -5,9 +5,9 @@ namespace caffe { -template +template void NeuronLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ReshapeLike(*bottom[0]); } diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index d66a24f6..92c71582 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -13,30 +13,30 @@ namespace caffe { using std::min; using std::max; -template +template void PoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { PoolingParameter pool_param = this->layer_param_.pooling_param(); if (pool_param.global_pooling()) { CHECK(!(pool_param.has_kernel_size() || - pool_param.has_kernel_h() || pool_param.has_kernel_w())) - << 
"With Global_pooling: true Filter size cannot specified"; + pool_param.has_kernel_h() || pool_param.has_kernel_w())) + << "With Global_pooling: true Filter size cannot specified"; } else { CHECK(!pool_param.has_kernel_size() != - !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; CHECK(pool_param.has_kernel_size() || - (pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; + (pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; } CHECK((!pool_param.has_pad() && pool_param.has_pad_h() - && pool_param.has_pad_w()) - || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; + && pool_param.has_pad_w()) + || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; CHECK((!pool_param.has_stride() && pool_param.has_stride_h() - && pool_param.has_stride_w()) - || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; + && pool_param.has_stride_w()) + || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; global_pooling_ = pool_param.global_pooling(); if (global_pooling_) { kernel_h_ = bottom[0]->height(); @@ -65,24 +65,24 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, } if (global_pooling_) { CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; + << "With Global_pooling: true; only pad = 0 and stride = 1"; } if (pad_h_ != 0 || pad_w_ != 0) { CHECK(this->layer_param_.pooling_param().pool() - == 
PoolingParameter_PoolMethod_AVE - || this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_MAX) - << "Padding implemented only for average and max pooling."; + == PoolingParameter_PoolMethod_AVE + || this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) + << "Padding implemented only for average and max pooling."; CHECK_LT(pad_h_, kernel_h_); CHECK_LT(pad_w_, kernel_w_); } } -template +template void PoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; + << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); @@ -91,9 +91,9 @@ void PoolingLayer::Reshape(const vector*>& bottom, kernel_w_ = bottom[0]->width(); } pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; if (pad_h_ || pad_w_) { // If we have padding, ensure that the last pooling starts strictly // inside the image (instead of at the padding); otherwise clip the last. @@ -107,29 +107,29 @@ void PoolingLayer::Reshape(const vector*>& bottom, CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); } top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + pooled_width_); if (top.size() > 1) { top[1]->ReshapeLike(*top[0]); } // If max pooling, we will initialize the vector index part. 
if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX && top.size() == 1) { + PoolingParameter_PoolMethod_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + pooled_width_); } // If stochastic pooling, we will initialize the random index part. if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_STOCHASTIC) { + PoolingParameter_PoolMethod_STOCHASTIC) { rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + pooled_width_); } } // TODO(Yangqing): Is there a faster way to do pooling in the channel-first // case? -template +template void PoolingLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int top_count = top[0]->count(); @@ -209,7 +209,7 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { top_data[ph * pooled_width_ + pw] += - bottom_data[h * width_ + w]; + bottom_data[h * width_ + w]; } } top_data[ph * pooled_width_ + pw] /= pool_size; @@ -229,9 +229,9 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, } } -template +template void PoolingLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -258,7 +258,7 @@ void PoolingLayer::Backward_cpu(const vector*>& top, for (int pw = 0; pw < pooled_width_; ++pw) { const int index = ph * pooled_width_ + pw; const int bottom_index = - use_top_mask ? top_mask[index] : mask[index]; + use_top_mask ? 
top_mask[index] : mask[index]; bottom_diff[bottom_index] += top_diff[index]; } } @@ -290,7 +290,7 @@ void PoolingLayer::Backward_cpu(const vector*>& top, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { bottom_diff[h * width_ + w] += - top_diff[ph * pooled_width_ + pw] / pool_size; + top_diff[ph * pooled_width_ + pw] / pool_size; } } } @@ -309,9 +309,9 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } -template +template void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { //Forward_cpu(bottom, top); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); @@ -329,31 +329,31 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, } // NOLINT_NEXT_LINE(whitespace/operators) MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, + mask, top_mask); break; case PoolingParameter_PoolMethod_AVE: // NOLINT_NEXT_LINE(whitespace/operators) AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); break; case PoolingParameter_PoolMethod_STOCHASTIC: if (this->phase_ == TRAIN) { // We need to create the random index as well. 
caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); + rand_idx_.mutable_gpu_data()); // NOLINT_NEXT_LINE(whitespace/operators) StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, + rand_idx_.mutable_gpu_data(), top_data); } else { // NOLINT_NEXT_LINE(whitespace/operators) StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, top_data); } break; default: @@ -361,9 +361,9 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, } } -template +template void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { //Backward_cpu(top, propagate_down, bottom); if (!propagate_down[0]) { return; @@ -385,22 +385,22 @@ void PoolingLayer::Backward_gpu(const vector*>& top, } // NOLINT_NEXT_LINE(whitespace/operators) MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); + height_, width_, pooled_height_, pooled_width_, + kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, + bottom_diff); break; case PoolingParameter_PoolMethod_AVE: // NOLINT_NEXT_LINE(whitespace/operators) AvePoolBackward(count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, 
stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); break; case PoolingParameter_PoolMethod_STOCHASTIC: // NOLINT_NEXT_LINE(whitespace/operators) StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); + top[0]->num(), channels_, height_, width_, pooled_height_, + pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, + bottom_diff); break; default: LOG(FATAL) << "Unknown pooling method."; diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index e4a3e456..93ef9e1f 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { -template +template void PowerLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); power_ = this->layer_param_.power_param().power(); scale_ = this->layer_param_.power_param().scale(); @@ -20,9 +20,9 @@ void PowerLayer::LayerSetUp(const vector*>& bottom, } // Compute y = (shift + scale * x)^power -template +template void PowerLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); // Special case where we can ignore the input: scale or power is 0. 
@@ -44,10 +44,10 @@ void PowerLayer::Forward_cpu(const vector*>& bottom, } } -template +template void PowerLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int count = bottom[0]->count(); @@ -63,7 +63,7 @@ void PowerLayer::Backward_cpu(const vector*>& top, // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); + Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); } @@ -96,9 +96,9 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } -template +template void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); // Special case where we can ignore the input: scale or power is 0. 
@@ -120,9 +120,9 @@ void PowerLayer::Forward_gpu(const vector*>& bottom, } } -template +template void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int count = bottom[0]->count(); @@ -138,7 +138,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); + Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); } diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 5332a178..cbf7f064 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -7,11 +7,11 @@ namespace caffe { -template +template void PReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; + << "Number of axes of bottom blob must be >=2."; PReLUParameter prelu_param = this->layer_param().prelu_param(); int channels = bottom[0]->channels(); channel_shared_ = prelu_param.channel_shared(); @@ -37,10 +37,10 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, } if (channel_shared_) { CHECK_EQ(this->blobs_[0]->count(), 1) - << "Negative slope size is inconsistent with prototxt config"; + << "Negative slope size is inconsistent with prototxt config"; } else { CHECK_EQ(this->blobs_[0]->count(), channels) - << "Negative slope size is inconsistent with prototxt config"; + << "Negative slope size is inconsistent with prototxt config"; } // Propagate gradients to the parameters (as directed by backward pass). 
@@ -50,11 +50,11 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); } -template +template void PReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; + << "Number of axes of bottom blob must be >=2."; top[0]->ReshapeLike(*bottom[0]); if (bottom[0] == top[0]) { // For in-place computation @@ -62,9 +62,9 @@ void PReLULayer::Reshape(const vector*>& bottom, } } -template +template void PReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); @@ -83,14 +83,14 @@ void PReLULayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < count; ++i) { int c = (i / dim) % channels / div_factor; top_data[i] = std::max(bottom_data[i], Dtype(0)) - + slope_data[c] * std::min(bottom_data[i], Dtype(0)); + + slope_data[c] * std::min(bottom_data[i], Dtype(0)); } } -template +template void PReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* slope_data = this->blobs_[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -124,14 +124,14 @@ void PReLULayer::Backward_cpu(const vector*>& top, for (int i = 0; i < count; ++i) { int c = (i / dim) % channels / div_factor; bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + slope_data[c] * (bottom_data[i] <= 0)); + + slope_data[c] * (bottom_data[i] <= 0)); } } } -template +template void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const 
int count = bottom[0]->count(); @@ -144,12 +144,12 @@ void PReLULayer::Forward_gpu(const vector*>& bottom, caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); } PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, - div_factor); + div_factor); } -template +template void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); const int count = bottom[0]->count(); @@ -172,18 +172,18 @@ void PReLULayer::Backward_gpu(const vector*>& top, // compute element-wise diff // NOLINT_NEXT_LINE(whitespace/operators) PReLUParamBackward( - cdim, top_diff, top[0]->offset(n), - bottom_data, bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); + cdim, top_diff, top[0]->offset(n), + bottom_data, bottom[0]->offset(n), + backward_buff_.mutable_gpu_diff()); if (channel_shared_) { Dtype d; caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); + multiplier_.gpu_data(), &d); dsum += d; } else { caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); + backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., + slope_diff); } } if (channel_shared_) { @@ -197,8 +197,8 @@ void PReLULayer::Backward_gpu(const vector*>& top, int div_factor = channel_shared_ ? 
channels : 1; // NOLINT_NEXT_LINE(whitespace/operators) PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, - slope_data, - div_factor); + slope_data, + div_factor); } } diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 32ea4bc0..ddf70e46 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -8,29 +8,29 @@ namespace caffe { -template +template void ReductionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { op_ = this->layer_param_.reduction_param().operation(); } -template +template void ReductionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { axis_ = bottom[0]->CanonicalAxisIndex( - this->layer_param_.reduction_param().axis()); + this->layer_param_.reduction_param().axis()); // In the output, we'll keep all axes up to the reduction axis, but // throw away any after that. // Note: currently reducing along non-tail axes is not supported; otherwise, // we'd need to also copy any axes following an "end_axis". 
vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + axis_); + bottom[0]->shape().begin() + axis_); top[0]->Reshape(top_shape); num_ = bottom[0]->count(0, axis_); dim_ = bottom[0]->count(axis_); CHECK_EQ(num_, top[0]->count()); if (op_ == ReductionParameter_ReductionOp_SUM || - op_ == ReductionParameter_ReductionOp_MEAN) { + op_ == ReductionParameter_ReductionOp_MEAN) { vector sum_mult_shape(1, dim_); sum_multiplier_.Reshape(sum_mult_shape); caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); @@ -41,9 +41,9 @@ void ReductionLayer::Reshape(const vector*>& bottom, } } -template +template void ReductionLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* mult_data = NULL; if (sum_multiplier_.count() > 0) { @@ -64,7 +64,7 @@ void ReductionLayer::Forward_cpu( break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } bottom_data += dim_; ++top_data; @@ -76,9 +76,9 @@ void ReductionLayer::Forward_cpu( } } -template +template void ReductionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -96,7 +96,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -116,7 +116,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } bottom_data += dim_; bottom_diff += dim_; @@ -124,9 +124,9 
@@ void ReductionLayer::Backward_cpu(const vector*>& top, } } -template +template void ReductionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* mult_data = NULL; if (sum_multiplier_.count() > 0) { @@ -147,7 +147,7 @@ void ReductionLayer::Forward_gpu(const vector*>& bottom, break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } bottom_data += dim_; ++top_data; @@ -159,9 +159,9 @@ void ReductionLayer::Forward_gpu(const vector*>& bottom, } } -template +template void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -179,7 +179,7 @@ void ReductionLayer::Backward_gpu(const vector*>& top, break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); @@ -199,7 +199,7 @@ void ReductionLayer::Backward_gpu(const vector*>& top, break; default: LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); + << ReductionParameter_ReductionOp_Name(op_); } bottom_data += dim_; bottom_diff += dim_; diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 7f3b2729..334dc244 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,23 +5,23 @@ #include "caffe/vision_layers.hpp" namespace caffe { -template +template void ReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); Dtype negative_slope 
= this->layer_param_.relu_param().negative_slope(); for (int i = 0; i < count; ++i) { top_data[i] = std::max(bottom_data[i], Dtype(0)) - + negative_slope * std::min(bottom_data[i], Dtype(0)); + + negative_slope * std::min(bottom_data[i], Dtype(0)); } } -template +template void ReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -30,14 +30,14 @@ void ReLULayer::Backward_cpu(const vector*>& top, Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + negative_slope * (bottom_data[i] <= 0)); + + negative_slope * (bottom_data[i] <= 0)); } } } -template +template void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); @@ -45,10 +45,10 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, ReLUForward(count, bottom_data, top_data, negative_slope); } -template +template void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index 8dbbbcb0..094e61ef 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -5,9 +5,9 @@ namespace caffe { -template +template void ReshapeLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { inferred_axis_ = -1; copy_axes_.clear(); const 
BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); @@ -19,7 +19,7 @@ void ReshapeLayer::LayerSetUp(const vector*>& bottom, copy_axes_.push_back(i); } else if (top_dim == -1) { CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " - << "-1 dims; at most a single (1) value of -1 may be specified"; + << "-1 dims; at most a single (1) value of -1 may be specified"; inferred_axis_ = i; } else { constant_count_ *= top_dim; @@ -27,22 +27,22 @@ void ReshapeLayer::LayerSetUp(const vector*>& bottom, } } -template +template void ReshapeLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int input_start_axis = this->layer_param_.reshape_param().axis(); const int start_axis = - (input_start_axis >= 0) ? input_start_axis : - bottom[0]->num_axes() + input_start_axis + 1; + (input_start_axis >= 0) ? input_start_axis : + bottom[0]->num_axes() + input_start_axis + 1; CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis - << " out of range for " << bottom[0]->num_axes() << "-D input blob"; + << " out of range for " << bottom[0]->num_axes() << "-D input blob"; const int num_axes = this->layer_param_.reshape_param().num_axes(); CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; const int end_axis = - (num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes); + (num_axes == -1) ? 
bottom[0]->num_axes() : (start_axis + num_axes); CHECK_LE(end_axis, bottom[0]->num_axes()) - << "end_axis = axis + num_axes is out of range"; + << "end_axis = axis + num_axes is out of range"; const int num_axes_replaced = end_axis - start_axis; const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); @@ -62,10 +62,10 @@ void ReshapeLayer::Reshape(const vector*>& bottom, for (int i = 0; i < copy_axes_.size(); ++i) { const int copy_axis_index = copy_axes_[i]; CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) - << "new shape contains a 0, but there was no corresponding bottom axis " - << "to copy"; + << "new shape contains a 0, but there was no corresponding bottom axis " + << "to copy"; top_shape[start_axis + copy_axis_index] = - bottom[0]->shape(start_axis + copy_axis_index); + bottom[0]->shape(start_axis + copy_axis_index); } if (inferred_axis_ >= 0) { // A -1 dim was specified; infer the correct dimension by computing the @@ -78,14 +78,14 @@ void ReshapeLayer::Reshape(const vector*>& bottom, explicit_count *= top_shape[start_axis + copy_axis_index]; } CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" - << bottom[0]->count() << ") must be divisible by the product of " - << "the specified dimensions (" << explicit_count << ")"; + << bottom[0]->count() << ") must be divisible by the product of " + << "the specified dimensions (" << explicit_count << ")"; const int inferred_dim = bottom[0]->count() / explicit_count; top_shape[start_axis + inferred_axis_] = inferred_dim; } top[0]->Reshape(top_shape); CHECK_EQ(top[0]->count(), bottom[0]->count()) - << "output count must match input count"; + << "output count must match input count"; top[0]->ShareData(*bottom[0]); top[0]->ShareDiff(*bottom[0]); } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index a5be48e7..2a6d99e2 
100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -8,9 +8,9 @@ namespace caffe { -template +template void SigmoidCrossEntropyLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::LayerSetUp(bottom, top); sigmoid_bottom_vec_.clear(); sigmoid_bottom_vec_.push_back(bottom[0]); @@ -19,18 +19,18 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); } -template +template void SigmoidCrossEntropyLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << - "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; + "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } -template +template void SigmoidCrossEntropyLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { // The forward pass computes the sigmoid outputs. 
sigmoid_bottom_vec_[0] = bottom[0]; sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); @@ -43,18 +43,18 @@ void SigmoidCrossEntropyLossLayer::Forward_cpu( Dtype loss = 0; for (int i = 0; i < count; ++i) { loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } top[0]->mutable_cpu_data()[0] = loss / num; } -template +template void SigmoidCrossEntropyLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { // First, compute the diff @@ -70,13 +70,13 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } -template +template void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { // First, compute the diff diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 4095ccdb..833e1ced 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -8,14 +8,14 @@ namespace caffe { -template +template inline Dtype sigmoid(Dtype x) { return 1. / (1. 
+ exp(-x)); } -template +template void SigmoidLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); @@ -24,10 +24,10 @@ void SigmoidLayer::Forward_cpu(const vector*>& bottom, } } -template +template void SigmoidLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -40,9 +40,9 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } -template +template void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); @@ -50,9 +50,9 @@ void SigmoidLayer::Forward_gpu(const vector*>& bottom, SigmoidForward(count, bottom_data, top_data); } -template +template void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 05929a70..502d0aab 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -6,30 +6,30 @@ namespace caffe { -template +template void SilenceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < bottom.size(); ++i) { if (propagate_down[i]) { caffe_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_cpu_data()); + 
bottom[i]->mutable_cpu_data()); } } } -template +template void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Do nothing. } -template +template void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < bottom.size(); ++i) { if (propagate_down[i]) { caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); + bottom[i]->mutable_gpu_data()); } } } diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 7b327527..a005ceba 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -7,21 +7,21 @@ namespace caffe { -template +template void SliceLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const SliceParameter& slice_param = this->layer_param_.slice_param(); CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) - << "Either axis or slice_dim should be specified; not both."; + << "Either axis or slice_dim should be specified; not both."; slice_point_.clear(); std::copy(slice_param.slice_point().begin(), - slice_param.slice_point().end(), - std::back_inserter(slice_point_)); + slice_param.slice_point().end(), + std::back_inserter(slice_point_)); } -template +template void SliceLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_axes = bottom[0]->num_axes(); const SliceParameter& slice_param = this->layer_param_.slice_param(); if (slice_param.has_slice_dim()) { @@ -29,8 +29,8 @@ void SliceLayer::Reshape(const vector*>& bottom, // Don't allow negative indexing for slice_dim, a uint32 -- almost // certainly unintended. 
CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " - << "produced negative result; slice_dim must satisfy " - << "0 <= slice_dim < " << kMaxBlobAxes; + << "produced negative result; slice_dim must satisfy " + << "0 <= slice_dim < " << kMaxBlobAxes; CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; } else { slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); @@ -58,8 +58,8 @@ void SliceLayer::Reshape(const vector*>& bottom, } } else { CHECK_EQ(bottom_slice_axis % top.size(), 0) - << "Number of top blobs (" << top.size() << ") should evenly " - << "divide input slice axis (" << bottom_slice_axis << ")"; + << "Number of top blobs (" << top.size() << ") should evenly " + << "divide input slice axis (" << bottom_slice_axis << ")"; top_shape[slice_axis_] = bottom_slice_axis / top.size(); for (int i = 0; i < top.size(); ++i) { top[i]->Reshape(top_shape); @@ -69,9 +69,9 @@ void SliceLayer::Reshape(const vector*>& bottom, CHECK_EQ(count, bottom[0]->count()); } -template +template void SliceLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int offset_slice_axis = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -81,17 +81,17 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; caffe_copy(top_slice_axis * slice_size_, - bottom_data + bottom_offset, top_data + top_offset); + bottom_data + bottom_offset, top_data + top_offset); } offset_slice_axis += top_slice_axis; } } -template +template void SliceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { 
return; } @@ -104,22 +104,22 @@ void SliceLayer::Backward_cpu(const vector*>& top, for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; caffe_copy(top_slice_axis * slice_size_, - top_diff + top_offset, bottom_diff + bottom_offset); + top_diff + top_offset, bottom_diff + bottom_offset); } offset_slice_axis += top_slice_axis; } } -template +template void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } -template +template void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { } #ifdef CPU_ONLY diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 24d1e4b8..d4cab577 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -7,11 +7,11 @@ namespace caffe { -template +template void SoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); top[0]->ReshapeLike(*bottom[0]); vector mult_dims(1, bottom[0]->shape(softmax_axis_)); sum_multiplier_.Reshape(mult_dims); @@ -24,13 +24,13 @@ void SoftmaxLayer::Reshape(const vector*>& bottom, scale_.Reshape(scale_dims); } -template +template SoftmaxLayer::~SoftmaxLayer() { } -template +template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); Dtype* scale_data = scale_.mutable_cpu_data(); @@ -45,17 +45,17 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, 
for (int j = 0; j < channels; j++) { for (int k = 0; k < inner_num_; k++) { scale_data[k] = std::max(scale_data[k], - bottom_data[i * dim + j * inner_num_ + k]); + bottom_data[i * dim + j * inner_num_ + k]); } } // subtraction caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_, - 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); + 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); // exponentiation caffe_exp < Dtype > (dim, top_data, top_data); // sum after exp caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1., - top_data, sum_multiplier_.cpu_data(), 0., scale_data); + top_data, sum_multiplier_.cpu_data(), 0., scale_data); // division for (int j = 0; j < channels; j++) { caffe_div(inner_num_, top_data, scale_data, top_data); @@ -64,10 +64,10 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, } } -template +template void SoftmaxLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -79,21 +79,22 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, // compute dot(top_diff, top_data) and subtract them from the bottom diff for (int k = 0; k < inner_num_; ++k) { scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels, - bottom_diff + i * dim + k, inner_num_, - top_data + i * dim + k, inner_num_); + bottom_diff + i * dim + k, inner_num_, + top_data + i * dim + k, inner_num_); } // subtraction caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, - -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, + -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + + i * dim); } // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, 
top_data, bottom_diff); } -template +template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); Dtype* scale_data = scale_.mutable_gpu_data(); @@ -107,27 +108,27 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data, - scale_data); + scale_data); // subtract // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, top_data); + scale_data, top_data); // exponentiate // NOLINT_NEXT_LINE(whitespace/operators) kernel_exp < Dtype > (count, top_data, top_data); // sum after exp // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data, - scale_data); + scale_data); // divide // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, top_data); + scale_data, top_data); } -template +template void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* top_data = top[0]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); @@ -139,10 +140,10 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); + top_diff, top_data, scale_data); // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); + scale_data, bottom_diff); // elementwise multiplication caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, 
bottom_diff); diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index b998c2f6..58872a72 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { -template +template void SoftmaxWithLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::LayerSetUp(bottom, top); LayerParameter softmax_param(this->layer_param_); softmax_param.set_type("Softmax"); @@ -23,7 +23,7 @@ void SoftmaxWithLossLayer::LayerSetUp( softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); has_ignore_label_ = - this->layer_param_.loss_param().has_ignore_label(); + this->layer_param_.loss_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.loss_param().ignore_label(); } @@ -32,40 +32,40 @@ void SoftmaxWithLossLayer::LayerSetUp( ocl_setup(); } -template +template void SoftmaxWithLossLayer::ocl_setup() { d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, - sizeof(Dtype), NULL, NULL); + sizeof(Dtype), NULL, NULL); } -template +template SoftmaxWithLossLayer::~SoftmaxWithLossLayer() { } -template +template void SoftmaxWithLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { LossLayer < Dtype > ::Reshape(bottom, top); softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); outer_num_ = bottom[0]->count(0, softmax_axis_); inner_num_ = bottom[0]->count(softmax_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " - << "label count (number of labels) must be N*H*W, " - 
<< "with integer values in {0, 1, ..., C-1}."; + << "Number of labels must match number of predictions; " + << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; if (top.size() >= 2) { // softmax output top[1]->ReshapeLike(*bottom[0]); } } -template +template void SoftmaxWithLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { // The forward pass computes the softmax prob values. softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); const Dtype* prob_data = prob_.cpu_data(); @@ -82,7 +82,7 @@ void SoftmaxWithLossLayer::Forward_cpu( DCHECK_GE(label_value, 0); DCHECK_LT(label_value, prob_.shape(softmax_axis_)); loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], - Dtype(FLT_MIN))); + Dtype(FLT_MIN))); ++count; } } @@ -96,12 +96,12 @@ void SoftmaxWithLossLayer::Forward_cpu( } } -template +template void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -133,9 +133,9 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } -template +template void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { + const vector*>& bottom, const vector*>& top) { softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); const Dtype* prob_data = prob_.gpu_data(); const Dtype* label = bottom[1]->gpu_data(); @@ -150,7 +150,7 @@ void SoftmaxWithLossLayer::Forward_gpu( Dtype* counts = prob_.mutable_gpu_diff(); // NOLINT_NEXT_LINE(whitespace/operators) SoftmaxLossForwardGPU 
< Dtype > (nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); Dtype loss; caffe_gpu_asum(nthreads, loss_data, &loss); if (normalize_) { @@ -167,12 +167,12 @@ void SoftmaxWithLossLayer::Forward_gpu( } } -template +template void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); @@ -188,7 +188,7 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, Dtype* counts = prob_.mutable_gpu_diff(); // NOLINT_NEXT_LINE(whitespace/operators) SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); const Dtype loss_weight = top[0]->cpu_diff()[0]; if (normalize_) { Dtype count; diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 0ad8179a..8b19d293 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -6,9 +6,9 @@ namespace caffe { -template +template void SplitLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { count_ = bottom[0]->count(); for (int i = 0; i < top.size(); ++i) { // Do not allow in-place computation in the SplitLayer. Instead, share data @@ -17,25 +17,25 @@ void SplitLayer::Reshape(const vector*>& bottom, // blob of the first split output with the input, but this seems to cause // some strange effects in practice...) 
CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + "allow in-place computation."; top[i]->ReshapeLike(*bottom[0]); CHECK_EQ(count_, top[i]->count()); } gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", - NULL); + NULL); } -template +template void SplitLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { top[i]->ShareData(*bottom[0]); } } -template +template void SplitLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -44,7 +44,7 @@ void SplitLayer::Backward_cpu(const vector*>& top, return; } caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), - bottom[0]->mutable_cpu_diff()); + bottom[0]->mutable_cpu_diff()); // Add remaining top blob diffs. for (int i = 2; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); @@ -53,17 +53,17 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } -template +template void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { top[i]->ShareData(*bottom[0]); } } -template +template void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -72,7 +72,7 @@ void SplitLayer::Backward_gpu(const vector*>& top, return; } caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); + bottom[0]->mutable_gpu_diff()); // Add remaining top blob diffs. 
for (int i = 2; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp index bfc7778c..4c630fb7 100644 --- a/src/caffe/layers/spp_layer.cpp +++ b/src/caffe/layers/spp_layer.cpp @@ -13,9 +13,9 @@ namespace caffe { using std::min; using std::max; -template +template LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { LayerParameter pooling_param; int num_bins = pow(2, pyramid_level); @@ -44,15 +44,15 @@ LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, switch (spp_param.pool()) { case SPPParameter_PoolMethod_MAX: pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); + PoolingParameter_PoolMethod_MAX); break; case SPPParameter_PoolMethod_AVE: pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); + PoolingParameter_PoolMethod_AVE); break; case SPPParameter_PoolMethod_STOCHASTIC: pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); + PoolingParameter_PoolMethod_STOCHASTIC); break; default: LOG(FATAL) << "Unknown pooling method."; @@ -61,9 +61,9 @@ LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, return pooling_param; } -template +template void SPPLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { SPPParameter spp_param = this->layer_param_.spp_param(); bottom_h_ = bottom[0]->height(); @@ -104,10 +104,10 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, // pooling layer setup LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); + i, bottom_h_, bottom_w_, spp_param); pooling_layers_.push_back(shared_ptr < PoolingLayer > ( - new PoolingLayer(pooling_param))); + new PoolingLayer(pooling_param))); 
pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); // flatten layer output holders setup @@ -130,11 +130,11 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, concat_layer_->SetUp(concat_bottom_vec_, top); } -template +template void SPPLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; + << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); bottom_h_ = bottom[0]->height(); bottom_w_ = bottom[0]->width(); @@ -142,36 +142,36 @@ void SPPLayer::Reshape(const vector*>& bottom, split_layer_->Reshape(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); + i, bottom_h_, bottom_w_, spp_param); pooling_layers_[i].reset( - new PoolingLayer(pooling_param)); + new PoolingLayer(pooling_param)); pooling_layers_[i]->SetUp( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); pooling_layers_[i]->Reshape( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); flatten_layers_[i]->Reshape( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + *pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Reshape(concat_bottom_vec_, top); } -template +template void SPPLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { pooling_layers_[i]->Forward( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); flatten_layers_[i]->Forward( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + *pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Forward(concat_bottom_vec_, top); } -template 
+template void SPPLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -179,9 +179,9 @@ void SPPLayer::Backward_cpu(const vector*>& top, concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); for (int i = 0; i < pyramid_height_; i++) { flatten_layers_[i]->Backward( - *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); + *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); pooling_layers_[i]->Backward( - *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); + *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); } split_layer_->Backward(split_top_vec_, propagate_down, bottom); } diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 16405761..52a8a8c7 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -10,9 +10,9 @@ namespace caffe { -template +template void TanHLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); @@ -21,10 +21,10 @@ void TanHLayer::Forward_cpu(const vector*>& bottom, } } -template +template void TanHLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, + const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -38,9 +38,9 @@ void TanHLayer::Backward_cpu(const vector*>& top, } } -template +template void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); @@ -48,9 +48,9 @@ void 
TanHLayer::Forward_gpu(const vector*>& bottom, TanHForward(count, bottom_data, top_data); } -template +template void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index ca14de00..7d99226f 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -6,16 +6,16 @@ namespace caffe { -template +template void ThresholdLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); threshold_ = this->layer_param_.threshold_param().threshold(); } -template +template void ThresholdLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); @@ -24,9 +24,9 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } } -template +template void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); const int count = bottom[0]->count(); diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 0525b640..68b1b1e5 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -25,14 +25,14 @@ namespace caffe { -template +template WindowDataLayer::~WindowDataLayer() { this->JoinPrefetchThread(); } -template +template void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // LayerSetUp runs through the window_file and 
creates two structures // that hold windows: one for foreground (object) windows and one // for background (non-object) windows. We use an overlap threshold @@ -49,23 +49,23 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, // class_index overlap x1 y1 x2 y2 LOG(INFO) << "Window data layer:" << std::endl - << " foreground (object) overlap threshold: " - << this->layer_param_.window_data_param().fg_threshold() << std::endl - << " background (non-object) overlap threshold: " - << this->layer_param_.window_data_param().bg_threshold() << std::endl - << " foreground sampling fraction: " - << this->layer_param_.window_data_param().fg_fraction() << std::endl - << " cache_images: " - << this->layer_param_.window_data_param().cache_images() << std::endl - << " root_folder: " - << this->layer_param_.window_data_param().root_folder(); + << " foreground (object) overlap threshold: " + << this->layer_param_.window_data_param().fg_threshold() << std::endl + << " background (non-object) overlap threshold: " + << this->layer_param_.window_data_param().bg_threshold() << std::endl + << " foreground sampling fraction: " + << this->layer_param_.window_data_param().fg_fraction() << std::endl + << " cache_images: " + << this->layer_param_.window_data_param().cache_images() << std::endl + << " root_folder: " + << this->layer_param_.window_data_param().root_folder(); cache_images_ = this->layer_param_.window_data_param().cache_images(); string root_folder = this->layer_param_.window_data_param().root_folder(); const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); + this->transform_param_.mirror() || + this->transform_param_.crop_size(); if (prefetch_needs_rand) { const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); @@ -75,7 +75,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, std::ifstream 
infile(this->layer_param_.window_data_param().source().c_str()); CHECK(infile.good()) << "Failed to open window file " - << this->layer_param_.window_data_param().source() << std::endl; + << this->layer_param_.window_data_param().source() << std::endl; map label_hist; label_hist.insert(std::make_pair(0, 0)); @@ -109,9 +109,9 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, int num_windows; infile >> num_windows; const float fg_threshold = - this->layer_param_.window_data_param().fg_threshold(); + this->layer_param_.window_data_param().fg_threshold(); const float bg_threshold = - this->layer_param_.window_data_param().bg_threshold(); + this->layer_param_.window_data_param().bg_threshold(); for (int i = 0; i < num_windows; ++i) { int label, x1, y1, x2, y2; float overlap; @@ -144,27 +144,27 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, if (image_index % 100 == 0) { LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; + << image_path << " " + << image_size[0] << " " + << image_size[1] << " " + << image_size[2] << " " + << "windows to process: " << num_windows; } } while (infile >> hashtag >> image_index); LOG(INFO) << "Number of images: " << image_index + 1; for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { + it != label_hist.end(); ++it) { LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; + << " samples"; } LOG(INFO) << "Amount of context padding: " - << this->layer_param_.window_data_param().context_pad(); + << this->layer_param_.window_data_param().context_pad(); LOG(INFO) << "Crop mode: " - << this->layer_param_.window_data_param().crop_mode(); + << this->layer_param_.window_data_param().crop_mode(); // image const int crop_size = this->transform_param_.crop_size(); @@ -174,8 +174,8 @@ void WindowDataLayer::DataLayerSetUp(const 
vector*>& bottom, this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); @@ -186,7 +186,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, has_mean_values_ = this->transform_param_.mean_value_size() > 0; if (has_mean_file_) { const string& mean_file = - this->transform_param_.mean_file(); + this->transform_param_.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); @@ -194,12 +194,12 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, } if (has_mean_values_) { CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; + "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { mean_values_.push_back(this->transform_param_.mean_value(c)); } CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << - "Specify either 1 mean_value or as many as channels: " << channels; + "Specify either 1 mean_value or as many as channels: " << channels; if (channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < channels; ++c) { @@ -209,16 +209,16 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, } } -template +template unsigned int WindowDataLayer::PrefetchRand() { CHECK (prefetch_rng_); caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); + static_cast(prefetch_rng_->generator()); return (*prefetch_rng)(); } // Thread fetching the data -template +template void WindowDataLayer::InternalThreadEntry() { // At each iteration, sample N windows where N*p are 
foreground (object) // windows and N*(1-p) are background (non-object) windows @@ -235,7 +235,7 @@ void WindowDataLayer::InternalThreadEntry() { const int crop_size = this->transform_param_.crop_size(); const bool mirror = this->transform_param_.mirror(); const float fg_fraction = - this->layer_param_.window_data_param().fg_fraction(); + this->layer_param_.window_data_param().fg_fraction(); Dtype* mean = NULL; int mean_off = 0; int mean_width = 0; @@ -255,7 +255,7 @@ void WindowDataLayer::InternalThreadEntry() { caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); const int num_fg = static_cast(static_cast(batch_size) - * fg_fraction); + * fg_fraction); const int num_samples[2] = { batch_size - num_fg, num_fg }; int item_id = 0; @@ -266,20 +266,20 @@ void WindowDataLayer::InternalThreadEntry() { timer.Start(); const unsigned int rand_index = PrefetchRand(); vector window = - (is_fg) ? - fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; + (is_fg) ? 
+ fg_windows_[rand_index % fg_windows_.size()] : + bg_windows_[rand_index % bg_windows_.size()]; bool do_mirror = mirror && PrefetchRand() % 2; // load the image containing the window pair > image = - image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv::Mat cv_img; if (this->cache_images_) { pair < std::string, Datum > image_cached = - image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv_img = DecodeDatumToCVMat(image_cached.second, true); } else { cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); @@ -305,7 +305,7 @@ void WindowDataLayer::InternalThreadEntry() { // such that after warping the expanded region to crop_size x crop_size // there's exactly context_pad amount of padding on each side Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2 * context_pad); + static_cast(crop_size - 2 * context_pad); // compute the expanded region Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; @@ -349,15 +349,16 @@ void WindowDataLayer::InternalThreadEntry() { // scale factors that would be used to warp the unclipped // expanded region Dtype scale_x = - static_cast(crop_size) / static_cast(unclipped_width); + static_cast(crop_size) / static_cast(unclipped_width); Dtype scale_y = - static_cast(crop_size) / static_cast(unclipped_height); + static_cast(crop_size) + / static_cast(unclipped_height); // size to warp the clipped expanded region to cv_crop_size.width = - static_cast(round(static_cast(clipped_width) * scale_x)); + static_cast(round(static_cast(clipped_width) * scale_x)); cv_crop_size.height = - static_cast(round(static_cast(clipped_height) * scale_y)); + static_cast(round(static_cast(clipped_height) * scale_y)); pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); pad_y1 = 
static_cast(round(static_cast(pad_y1) * scale_y)); @@ -384,7 +385,7 @@ void WindowDataLayer::InternalThreadEntry() { cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); cv::Mat cv_cropped_img = cv_img(roi); cv::resize(cv_cropped_img, cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); + cv_crop_size, 0, 0, cv::INTER_LINEAR); // horizontal flip at random if (do_mirror) { @@ -398,12 +399,12 @@ void WindowDataLayer::InternalThreadEntry() { for (int w = 0; w < cv_cropped_img.cols; ++w) { for (int c = 0; c < channels; ++c) { int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; + * crop_size + w + pad_w; // int top_index = (c * height + h) * width + w; Dtype pixel = static_cast(ptr[img_index++]); if (this->has_mean_file_) { int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; + * mean_width + w + mean_off + pad_w; top_data[top_index] = (pixel - mean[mean_index]) * scale; } else { if (this->has_mean_values_) { diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 53ec5461..23085112 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -19,12 +19,12 @@ namespace caffe { -template +template Net::Net(const NetParameter& param) { Init(param); } -template +template Net::Net(const string& param_file, Phase phase) { NetParameter param; ReadNetParamsFromTextFileOrDie(param_file, ¶m); @@ -32,7 +32,7 @@ Net::Net(const string& param_file, Phase phase) { Init(param); } -template +template void Net::Init(const NetParameter& in_param) { // Set phase from the state. phase_ = in_param.state().phase(); @@ -41,7 +41,7 @@ void Net::Init(const NetParameter& in_param) { NetParameter filtered_param; FilterNet(in_param, &filtered_param); LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); + << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. 
NetParameter param; InsertSplits(filtered_param, ¶m); @@ -50,14 +50,14 @@ void Net::Init(const NetParameter& in_param) { map blob_name_to_idx; set < string > available_blobs; CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) - << "Must specify either input_shape OR deprecated input_dim, not both."; + << "Must specify either input_shape OR deprecated input_dim, not both."; if (param.input_dim_size() > 0) { // Deprecated 4D dimensions. CHECK_EQ(param.input_size() * 4, param.input_dim_size()) - << "Incorrect input blob dimension specifications."; + << "Incorrect input blob dimension specifications."; } else { CHECK_EQ(param.input_size(), param.input_shape_size()) - << "Exactly one input_shape must be specified per input."; + << "Exactly one input_shape must be specified per input."; } memory_used_ = 0; // set the input blobs @@ -82,9 +82,9 @@ void Net::Init(const NetParameter& in_param) { const LayerParameter& layer_param = param.layer(layer_id); if (layer_param.propagate_down_size() > 0) { CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) - << "propagate_down param must be specified " - << "either 0 or bottom_size times "; + layer_param.bottom_size()) + << "propagate_down param must be specified " + << "either 0 or bottom_size times "; } layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); @@ -93,9 +93,9 @@ void Net::Init(const NetParameter& in_param) { // Figure out this layer's input and output for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { + ++bottom_id) { const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); + &available_blobs, &blob_name_to_idx); // If a blob needs backward, this layer should provide it. 
need_backward |= blob_need_backward_[blob_id]; } @@ -109,7 +109,7 @@ void Net::Init(const NetParameter& in_param) { Layer < Dtype > *layer = layers_[layer_id].get(); if (layer->AutoTopBlobs()) { const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); + std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); for (; num_top < needed_num_top; ++num_top) { // Add "anonymous" top blobs -- do not modify available_blobs or // blob_name_to_idx as we don't want these blobs to be usable as input @@ -135,17 +135,17 @@ void Net::Init(const NetParameter& in_param) { const int param_size = layer_param.param_size(); const int num_param_blobs = layers_[layer_id]->blobs().size(); CHECK_LE(param_size, num_param_blobs) - << "Too many params specified for layer " << layer_param.name(); + << "Too many params specified for layer " << layer_param.name(); ParamSpec default_param_spec; for (int param_id = 0; param_id < num_param_blobs; ++param_id) { const ParamSpec* param_spec = - (param_id < param_size) ? - &layer_param.param(param_id) : - &default_param_spec; + (param_id < param_size) ? 
+ &layer_param.param(param_id) : + &default_param_spec; const bool param_need_backward = param_spec->lr_mult() > 0; need_backward |= param_need_backward; layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); + param_need_backward); } for (int param_id = 0; param_id < num_param_blobs; ++param_id) { AppendParam(param, layer_id, param_id); @@ -172,7 +172,7 @@ void Net::Init(const NetParameter& in_param) { for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { layer_contributes_loss = true; } if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { @@ -186,7 +186,7 @@ void Net::Init(const NetParameter& in_param) { if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { layer_need_backward_[layer_id] = false; for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = false; } } @@ -197,20 +197,20 @@ void Net::Init(const NetParameter& in_param) { LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; } else { LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; + << " does not need backward computation."; } for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { if (layer_contributes_loss) { const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; blobs_under_loss.insert(blob_name); } else { bottom_need_backward_[layer_id][bottom_id] = false; } if (!bottom_need_backward_[layer_id][bottom_id]) { const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; 
blobs_skip_backp.insert(blob_name); } } @@ -220,23 +220,23 @@ void Net::Init(const NetParameter& in_param) { for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { layer_need_backward_[layer_id] = true; for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); + bottom_need_backward_[layer_id][bottom_id] || + layers_[layer_id]->AllowForceBackward(bottom_id); blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || + bottom_need_backward_[layer_id][bottom_id]; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { layers_[layer_id]->set_param_propagate_down(param_id, true); } } } // In the end, all remaining blobs are considered output blobs. 
for (set::iterator it = available_blobs.begin(); - it != available_blobs.end(); ++it) { + it != available_blobs.end(); ++it) { LOG(INFO) << "This network produces output " << *it; net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); net_output_blob_indices_.push_back(blob_name_to_idx[*it]); @@ -253,9 +253,9 @@ void Net::Init(const NetParameter& in_param) { LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); } -template +template void Net::FilterNet(const NetParameter& param, - NetParameter* param_filtered) { + NetParameter* param_filtered) { NetState net_state(param.state()); param_filtered->CopyFrom(param); param_filtered->clear_layer(); @@ -263,7 +263,7 @@ void Net::FilterNet(const NetParameter& param, const LayerParameter& layer_param = param.layer(i); const string& layer_name = layer_param.name(); CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) - << "Specify either include rules or exclude rules; not both."; + << "Specify either include rules or exclude rules; not both."; // If no include rules are specified, the layer is included by default and // only excluded if it meets one of the exclude rules. bool layer_included = (layer_param.include_size() == 0); @@ -283,15 +283,15 @@ void Net::FilterNet(const NetParameter& param, } } -template +template bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { + const NetStateRule& rule, const string& layer_name) { // Check whether the rule is broken due to phase. 
if (rule.has_phase()) { if (rule.phase() != state.phase()) { LOG(INFO) << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; + << ") differed from the phase (" << rule.phase() + << ") specified by a rule in layer " << layer_name; return false; } } @@ -299,8 +299,8 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.has_min_level()) { if (state.level() < rule.min_level()) { LOG(INFO) << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; + << ") is above the min_level (" << rule.min_level() + << ") specified by a rule in layer " << layer_name; return false; } } @@ -308,8 +308,8 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.has_max_level()) { if (state.level() > rule.max_level()) { LOG(INFO) << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; + << ") is above the max_level (" << rule.max_level() + << ") specified by a rule in layer " << layer_name; return false; } } @@ -325,7 +325,7 @@ bool Net::StateMeetsRule(const NetState& state, } if (!has_stage) { LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -341,7 +341,7 @@ bool Net::StateMeetsRule(const NetState& state, } if (has_stage) { LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -350,30 +350,30 @@ bool Net::StateMeetsRule(const NetState& state, // Helper for Net::Init: add a new input or top blob to the net. (Inputs have // layer_id == -1, tops have layer_id >= 0.) 
-template +template void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { + const int top_id, set* available_blobs, + map* blob_name_to_idx) { shared_ptr < LayerParameter - > layer_param( - (layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : - NULL); + > layer_param( + (layer_id >= 0) ? + (new LayerParameter(param.layer(layer_id))) : + NULL); const string& blob_name = - layer_param ? - (layer_param->top_size() > top_id ? - layer_param->top(top_id) : - "(automatic)") : - param.input(top_id); + layer_param ? + (layer_param->top_size() > top_id ? + layer_param->top(top_id) : + "(automatic)") : + param.input(top_id); // Check if we are doing in-place computation if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { + blob_name == layer_param->bottom(top_id)) { // In-place computation LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { // If we are not doing in-place computation but have duplicated blobs, // raise an error. LOG(FATAL) << "Duplicate blobs produced by multiple sources."; @@ -396,9 +396,9 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, // Set the (explicitly specified) dimensions of the input blob. 
if (param.input_dim_size() > 0) { blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); + param.input_dim(top_id * 4 + 1), + param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); } else { blob_pointer->Reshape(param.input_shape(top_id)); } @@ -415,15 +415,15 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, } // Helper for Net::Init: add a new bottom blob to the net. -template +template int Net::AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx) { + const int bottom_id, set* available_blobs, + map* blob_name_to_idx) { const LayerParameter& layer_param = param.layer(layer_id); const string& blob_name = layer_param.bottom(bottom_id); if (available_blobs->find(blob_name) == available_blobs->end()) { LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; + << " (at index " << bottom_id << ") to layer " << layer_id; } const int blob_id = (*blob_name_to_idx)[blob_name]; LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; @@ -435,18 +435,18 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, if (layer_param.propagate_down_size() > 0) propagate_down = layer_param.propagate_down(bottom_id); const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; + propagate_down; bottom_need_backward_[layer_id].push_back(need_backward); return blob_id; } -template +template void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { + const int param_id) { const LayerParameter& layer_param = layers_[layer_id]->layer_param(); const int param_size = layer_param.param_size(); string param_name = - (param_size > param_id) ? layer_param.param(param_id).name() : ""; + (param_size > param_id) ? 
layer_param.param(param_id).name() : ""; if (param_name.size()) { param_display_names_.push_back(param_name); } else { @@ -459,7 +459,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, param_id_vecs_[layer_id].push_back(net_param_id); param_layer_indices_.push_back(make_pair(layer_id, param_id)); if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { + param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous // (i.e., not given a param_name) or explicitly given a name that we // haven't already seen. @@ -472,31 +472,31 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, const int owner_net_param_id = param_names_index_[param_name]; param_owners_.push_back(owner_net_param_id); const pair& owner_index = - param_layer_indices_[owner_net_param_id]; + param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; + << "layer '" << layer_names_[owner_layer_id] << "', param " + << "index " << owner_param_id; Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); Blob < Dtype > *owner_blob = - layers_[owner_layer_id]->blobs()[owner_param_id].get(); + layers_[owner_layer_id]->blobs()[owner_param_id].get(); const int param_size = layer_param.param_size(); if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { + ParamSpec_DimCheckMode_PERMISSIVE)) { // Permissive dimension checking -- only check counts are the same. 
CHECK_EQ(this_blob->count(), owner_blob->count()) - << "Shared parameter blobs must have the same count."; + << "Shared parameter blobs must have the same count."; } else { // Strict dimension checking -- all dims must be the same. CHECK(this_blob->shape() == owner_blob->shape()); } layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); + *layers_[owner_layer_id]->blobs()[owner_param_id]); } } -template +template void Net::GetLearningRateAndWeightDecay() { LOG(INFO) << "Collecting Learning Rate and Weight Decay."; ParamSpec default_param_spec; @@ -504,15 +504,15 @@ void Net::GetLearningRateAndWeightDecay() { vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); for (int j = 0; j < layer_blobs.size(); ++j) { const ParamSpec* param_spec = - (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; + (layers_[i]->layer_param().param_size() > j) ? + &layers_[i]->layer_param().param(j) : &default_param_spec; params_lr_.push_back(param_spec->lr_mult()); params_weight_decay_.push_back(param_spec->decay_mult()); } } } -template +template Dtype Net::ForwardFromTo(int start, int end) { CHECK_GE(start, 0); CHECK_LT(end, layers_.size()); @@ -537,7 +537,7 @@ Dtype Net::ForwardFromTo(int start, int end) { clFinish(amdDevice.CommandQueue); layer_timer.Stop(); printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), - layer_timer.MilliSeconds()); + layer_timer.MilliSeconds()); } forward_timer.Stop(); @@ -546,17 +546,17 @@ Dtype Net::ForwardFromTo(int start, int end) { return loss; } -template +template Dtype Net::ForwardFrom(int start) { return ForwardFromTo(start, layers_.size() - 1); } -template +template Dtype Net::ForwardTo(int end) { return ForwardFromTo(0, end); } -template +template const vector*>& Net::ForwardPrefilled(Dtype* loss) { if (loss != NULL) { *loss = ForwardFromTo(0, layers_.size() - 1); @@ -566,9 +566,9 @@ const vector*>& Net::ForwardPrefilled(Dtype* 
loss) { return net_output_blobs_; } -template +template const vector*>& Net::Forward( - const vector*> & bottom, Dtype* loss) { + const vector*> & bottom, Dtype* loss) { // Copy bottom to internal bottom for (int i = 0; i < bottom.size(); ++i) { net_input_blobs_[i]->CopyFrom(*bottom[i]); @@ -576,13 +576,13 @@ const vector*>& Net::Forward( return ForwardPrefilled(loss); } -template +template string Net::Forward(const string& input_blob_protos, Dtype* loss) { BlobProtoVector blob_proto_vec; if (net_input_blobs_.size()) { blob_proto_vec.ParseFromString(input_blob_protos); CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) - << "Incorrect input size."; + << "Incorrect input size."; for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); } @@ -597,7 +597,7 @@ string Net::Forward(const string& input_blob_protos, Dtype* loss) { return output; } -template +template void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); @@ -610,14 +610,14 @@ void Net::BackwardFromTo(int start, int end) { layer_timer.Start(); if (layer_need_backward_[i]) { layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); + top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } clFinish(amdDevice.CommandQueue); layer_timer.Start(); printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), - layer_timer.MilliSeconds()); + layer_timer.MilliSeconds()); } } @@ -625,38 +625,38 @@ void Net::BackwardFromTo(int start, int end) { printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); } -template +template void Net::InputDebugInfo(const int input_id) { const Blob& blob = *net_input_blobs_[input_id]; const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << 
data_abs_val_mean; + << "Input " << blob_name << " data: " << data_abs_val_mean; } -template +template void Net::ForwardDebugInfo(const int layer_id) { for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { const Blob& blob = *top_vecs_[layer_id][top_id]; const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << data_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const int net_param_id = param_id_vecs_[layer_id][param_id]; const string& blob_name = param_display_names_[net_param_id]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; + << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name + << " data: " << data_abs_val_mean; } } -template +template void Net::BackwardDebugInfo(const int layer_id) { const vector*>& bottom_vec = bottom_vecs_[layer_id]; for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { @@ -667,23 +667,23 @@ void Net::BackwardDebugInfo(const int layer_id) { const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << diff_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + 
++param_id) { if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << diff_abs_val_mean; } } -template +template void Net::UpdateDebugInfo(const int param_id) { const Blob& blob = *params_[param_id]; const int param_owner = param_owners_[param_id]; @@ -693,20 +693,20 @@ void Net::UpdateDebugInfo(const int param_id) { if (param_owner < 0) { const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; + << ", param " << param_display_name + << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; } else { const string& owner_layer_name = - layer_names_[param_layer_indices_[param_owner].first]; + layer_names_[param_layer_indices_[param_owner].first]; LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " - << "param " << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; + << ", param blob " << param_display_name + << " (owned by layer " << owner_layer_name << ", " + << "param " << param_display_names_[param_owners_[param_id]] << ")" + << " diff: " << diff_abs_val_mean; } } -template +template void Net::ShareTrainedLayersWith(const Net* other) { int num_source_layers = other->layers().size(); for (int i = 0; i < num_source_layers; ++i) { @@ -714,7 +714,7 @@ void Net::ShareTrainedLayersWith(const Net* other) { const string& source_layer_name = other->layer_names()[i]; int target_layer_id = 0; while (target_layer_id != 
layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { @@ -723,9 +723,9 @@ void Net::ShareTrainedLayersWith(const Net* other) { } DLOG(INFO) << "Copying source layer " << source_layer_name; vector < shared_ptr > > &target_blobs = - layers_[target_layer_id]->blobs(); + layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) - << "Incompatible number of blobs for layer " << source_layer_name; + << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); CHECK(target_blobs[j]->shape() == source_blob->shape()); @@ -734,17 +734,17 @@ void Net::ShareTrainedLayersWith(const Net* other) { } } -template +template void Net::BackwardFrom(int start) { BackwardFromTo(start, 0); } -template +template void Net::BackwardTo(int end) { BackwardFromTo(layers_.size() - 1, end); } -template +template void Net::Backward() { BackwardFromTo(layers_.size() - 1, 0); if (debug_info_) { @@ -761,19 +761,19 @@ void Net::Backward() { const Dtype l2norm_data = std::sqrt(sumsq_data); const Dtype l2norm_diff = std::sqrt(sumsq_diff); LOG(ERROR) << " [Backward] All net params (data, diff): " - << "L1 norm = (" << asum_data << ", " << asum_diff << "); " - << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; + << "L1 norm = (" << asum_data << ", " << asum_diff << "); " + << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; } } -template +template void Net::Reshape() { for (int i = 0; i < layers_.size(); ++i) { layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); } } -template +template void Net::CopyTrainedLayersFrom(const NetParameter& param) { int num_source_layers = param.layer_size(); for (int i = 0; i < num_source_layers; ++i) { @@ -781,7 +781,7 @@ void 
Net::CopyTrainedLayersFrom(const NetParameter& param) { const string& source_layer_name = source_layer.name(); int target_layer_id = 0; while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { @@ -790,9 +790,9 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { } DLOG(INFO) << "Copying source layer " << source_layer_name; vector < shared_ptr > > &target_blobs = - layers_[target_layer_id]->blobs(); + layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) - << "Incompatible number of blobs for layer " << source_layer_name; + << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { const bool kReshape = false; target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); @@ -800,14 +800,14 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { } } -template +template void Net::CopyTrainedLayersFrom(const string trained_filename) { NetParameter param; ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); CopyTrainedLayersFrom(param); } -template +template void Net::ToProto(NetParameter* param, bool write_diff) const { param->Clear(); param->set_name(name_); @@ -828,7 +828,7 @@ void Net::ToProto(NetParameter* param, bool write_diff) const { } } -template +template void Net::Update() { // First, accumulate the diffs of any shared parameters into their owner's // diff. (Assumes that the learning rate, weight decay, etc. 
have already been @@ -878,14 +878,14 @@ void Net::Update() { } } -template +template bool Net::has_blob(const string& blob_name) const { return blob_names_index_.find(blob_name) != blob_names_index_.end(); } -template +template const shared_ptr > Net::blob_by_name( - const string& blob_name) const { + const string& blob_name) const { shared_ptr < Blob > blob_ptr; if (has_blob(blob_name)) { blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; @@ -896,14 +896,14 @@ const shared_ptr > Net::blob_by_name( return blob_ptr; } -template +template bool Net::has_layer(const string& layer_name) const { return layer_names_index_.find(layer_name) != layer_names_index_.end(); } -template +template const shared_ptr > Net::layer_by_name( - const string& layer_name) const { + const string& layer_name) const { shared_ptr < Layer > layer_ptr; if (has_layer(layer_name)) { layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl index 5da76b7e..f23ff9a3 100644 --- a/src/caffe/ocl/bnll_layer.cl +++ b/src/caffe/ocl/bnll_layer.cl @@ -38,7 +38,7 @@ template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLFor template __kernel void BNLLBackward(const int n, __global const T* in_diff, - __global const T* in_data, __global T* out_diff) { + __global const T* in_data, __global T* out_diff) { int index = get_global_id(0); if (index < n) { T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); @@ -47,6 +47,6 @@ __kernel void BNLLBackward(const int n, __global const T* in_diff, } template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, - __global const float* in_data, __global float* out_diff); + __global const float* in_data, __global float* out_diff); template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff, - __global const double* in_data, 
__global double* out_diff); + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl index 2c2c76ee..ba5e1f54 100644 --- a/src/caffe/ocl/concat_layer.cl +++ b/src/caffe/ocl/concat_layer.cl @@ -26,29 +26,29 @@ template __kernel void Concat(const int nthreads, __global const T* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global T* out_data) { - int index = get_global_id(0); - if(index < nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward == 1) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward == 1) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } } -template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global float* out_data); -template 
__attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global double* out_data); +template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl index 0aeea80c..b6fdebc7 100644 --- a/src/caffe/ocl/contrastive_loss_layer.cl +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -26,9 +26,9 @@ template __kernel void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, - __global Dtype *bottom_diff) { + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { int i = get_global_id(0); if(i < count) { int n = i / channels; // the num index, to access y and dist_sq @@ -55,10 +55,10 @@ __kernel void CLLBackward(const int count, const int channels, } template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - 
__global const float* y, __global const float* diff, __global const float* dist_sq, - __global float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - __global const double* y, __global const double* diff, __global const double* dist_sq, - __global double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl index 0e1812d8..3f60a34f 100644 --- a/src/caffe/ocl/eltwise_layer.cl +++ b/src/caffe/ocl/eltwise_layer.cl @@ -26,8 +26,8 @@ template __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, - __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, - __global int* mask) { + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { int index = get_global_id(0); if(index < nthreads) { Dtype maxval = -FLT_MAX; @@ -49,15 +49,15 @@ __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a } } template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, - __global const float* bottom_data_b, const int blob_idx, __global float* top_data, - __global int* mask); + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, - 
__global const double* bottom_data_b, const int blob_idx, __global double* top_data, - __global int* mask); + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); template __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, - const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { int index = get_global_id(0); if(index < nthreads) { Dtype gradient = 0; @@ -68,6 +68,6 @@ __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, } } template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, - const int blob_idx, __global const int* mask, __global float* bottom_diff); + const int blob_idx, __global const int* mask, __global float* bottom_diff); template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, - const int blob_idx, __global const int* mask, __global double* bottom_diff); + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index c08d1310..46248024 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -94,11 +94,11 @@ template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_o template __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_col, const int col_offset) { + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int 
height_col, const int width_col, + __global T* data_col, const int col_offset) { data_im = data_im + img_offset; data_col = data_col + col_offset; @@ -128,22 +128,22 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in } template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global float* data_col, const int col_offset); + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global double* data_col, const int col_offset); + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); template __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_im, const int img_offset) { + const int height, const int 
width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { data_col = data_col + col_offset; data_im = data_im + img_offset; int index = get_global_id(0); @@ -172,14 +172,14 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i } template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w,const int pad_h, const int pad_w, - const int stride_h, const int stride_w,const int height_col, const int width_col, - __global float* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, - const int col_offset, const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); template __kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int 
ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) { diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl index 1a53f772..e9938966 100644 --- a/src/caffe/ocl/lrn_layer.cl +++ b/src/caffe/ocl/lrn_layer.cl @@ -78,7 +78,7 @@ __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, co } } } -template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); template diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 3162b92e..786ddc16 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -163,11 +163,11 @@ template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void template __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, - __global int* mask, __global T* top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, __global T* const bottom_diff) { + __global int* mask, __global T* top_mask, const int 
num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global T* const bottom_diff) { int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total) { @@ -246,11 +246,11 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave template __kernel void StoPoolBackward(const int nthreads, - __global Dtype* rand_idx, __global Dtype* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global Dtype* bottom_diff) { + __global Dtype* rand_idx, __global Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global Dtype* bottom_diff) { int index = get_global_id(0); int total = get_global_size(0); for(index; index < nthreads; index += total) { @@ -279,15 +279,15 @@ __kernel void StoPoolBackward(const int nthreads, } } -template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, - __global float* rand_idx, __global float* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, 
+ const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, - __global double* rand_idx, __global double* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* bottom_diff); + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global double* bottom_diff); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index 5fbea781..de46a5da 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -41,7 +41,7 @@ __kernel void PReLUBackward(const int count, const int channels, const int dim, if(index < count) { int c = (index / dim) % channels / div_factor; out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); + + (in_data[index] <= 0) * slope_data[c]); } } template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index 94a41db4..57b40dfe 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -47,9 +47,9 @@ enum r123_enum_threefry32x4 }; inline uint32_t RotL_32(uint32_t x, unsigned int N) - __attribute__((always_inline)); + __attribute__((always_inline)); inline uint32_t RotL_32(uint32_t x, unsigned int N) - { + { return (x << (N & 31)) | (x >> ((32 - N) & 
31)); } @@ -58,10 +58,10 @@ typedef struct r123array4x32 threefry4x32_key_t; typedef struct r123array4x32 threefry4x32_ukey_t; inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, - threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); + threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, - threefry4x32_ctr_t in, threefry4x32_key_t k) - { + threefry4x32_ctr_t in, threefry4x32_key_t k) + { threefry4x32_ctr_t X; uint32_t ks[4 + 1]; int i; @@ -95,7 +95,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, X.v[2] += ks[2]; X.v[3] += ks[3]; if (Nrounds > 0) - { + { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; @@ -802,13 +802,13 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, template __kernel void PRNG_threefry4x32( - __global uint4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T inf, - T sup, - T threshold, - uint nrounds, - uint numrandom + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom ) { size_t gdx = get_global_id(0); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl index 4069ce16..6fe0daab 100644 --- a/src/caffe/ocl/softmax_layer.cl +++ b/src/caffe/ocl/softmax_layer.cl @@ -75,7 +75,7 @@ template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softma template __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* out) { + const int spatial_dim, __global const T* data, __global T* out) { int index = get_global_id(0); if(index < num * spatial_dim) { int n = index / spatial_dim; @@ -89,14 +89,14 @@ __kernel void kernel_channel_max(const int num, const int channels, } template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, 
- const int spatial_dim, __global const float* data, __global float* out); + const int spatial_dim, __global const float* data, __global float* out); template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* out); + const int spatial_dim, __global const double* data, __global double* out); template __kernel void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_max, __global T* data) { + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { int index = get_global_id(0); if(index < count) { int n = index / channels / spatial_dim; @@ -109,7 +109,7 @@ template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel template __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* channel_sum) { + const int spatial_dim, __global const T* data, __global T* channel_sum) { int index = get_global_id(0); if(index < num * spatial_dim) { int n = index / spatial_dim; @@ -123,14 +123,14 @@ __kernel void kernel_channel_sum(const int num, const int channels, } template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* channel_sum); + const int spatial_dim, __global const float* data, __global float* channel_sum); template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* channel_sum); + const int spatial_dim, __global const double* data, __global double* channel_sum); template __kernel void kernel_channel_div(const int count, - 
const int num, const int channels, - const int spatial_dim, __global const T* channel_sum, __global T* data) { + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { int index = get_global_id(0); if(index < count) { int n = index / channels / spatial_dim; @@ -140,16 +140,16 @@ __kernel void kernel_channel_div(const int count, } template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const float* channel_sum, __global float* data); + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const double* channel_sum, __global double* data); + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); template __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const T* data_1, __global const T* data_2, - __global T* channel_dot) { + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { int index = get_global_id(0); if(index < num * spatial_dim) { int n = index / spatial_dim; @@ -157,15 +157,15 @@ __kernel void kernel_channel_dot(const int num, const int channels, T dot = 0; for (int c = 0; c < channels; ++c) { dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); + * data_2[(n * channels + c) * spatial_dim + s]); } channel_dot[index] = dot; } } template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const float* data_1, 
__global const float* data_2, - __global float* channel_dot); + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const double* data_1, __global const double* data_2, - __global double* channel_dot); + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index 025f59ac..70c282e1 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -26,10 +26,10 @@ template __kernel void SoftmaxLossForwardGPU(const int nthreads, - __global T* prob_data, __global T* label,__global T* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global T* counts) { + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global T* counts) { int index = get_global_id(0); if(index < nthreads) { const int n = index / spatial_dim; @@ -40,28 +40,28 @@ __kernel void SoftmaxLossForwardGPU(const int nthreads, counts[index] = 0; } else { loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - T(FLT_MIN))); + T(FLT_MIN))); counts[index] = 1; } } } template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global float* prob_data, __global float* label,__global float* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global float* counts); + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); 
template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global double* prob_data, __global double* label,__global double* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global double* counts); + __global double* prob_data, __global double* label,__global double* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); template __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, - __global T* label,__global T* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, T* counts) { + __global T* label,__global T* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, T* counts) { const int channels = dim / spatial_dim; int index = get_global_id(0); if(index < nthreads) { @@ -81,14 +81,14 @@ __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, } } template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, - __global float* label,__global float* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, float* counts); + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, float* counts); template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, - __global double* label,__global double* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, double* counts); + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, double* counts); template __kernel void scal (const int num, const T alpha, __global 
T* data) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ae675500..0a07a218 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -13,13 +13,14 @@ #include "caffe/util/ocl_wrapper.hpp" namespace caffe { -template +template Solver::Solver(const SolverParameter& param) - : net_() { +: + net_() { Init(param); } -template +template void Solver::ocl_setup() { scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); @@ -27,18 +28,19 @@ void Solver::ocl_setup() { powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } -template +template Solver::Solver(const string& param_file) - : net_() { +: + net_() { SolverParameter param; ReadProtoFromTextFileOrDie(param_file, ¶m); Init(param); } -template +template void Solver::Init(const SolverParameter& param) { LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); + << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; @@ -55,22 +57,22 @@ void Solver::Init(const SolverParameter& param) { current_step_ = 0; } -template +template void Solver::InitTrainNet() { const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); + param_.has_train_net() + param_.has_train_net_param(); const string& field_names = "net, net_param, train_net, train_net_param"; CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " - << "using one of these fields: " << field_names; + << "using one of these fields: " << field_names; CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " - << "one of these fields specifying a train_net: " << field_names; + << "one of these fields specifying a train_net: " << field_names; NetParameter net_param; if (param_.has_train_net_param()) { LOG(INFO) << "Creating training net 
specified in train_net_param."; net_param.CopyFrom(param_.train_net_param()); } else if (param_.has_train_net()) { LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); + << param_.train_net(); ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); } if (param_.has_net_param()) { @@ -93,22 +95,22 @@ void Solver::InitTrainNet() { net_.reset(new Net(net_param)); } -template +template void Solver::InitTestNets() { const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; CHECK_LE(num_generic_nets, 1) - << "Both net_param and net_file may not be specified."; + << "Both net_param and net_file may not be specified."; const int num_test_net_params = param_.test_net_param_size(); const int num_test_net_files = param_.test_net_size(); const int num_test_nets = num_test_net_params + num_test_net_files; if (num_generic_nets) { CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + << "test_iter must be specified for each test network."; } else { CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + << "test_iter must be specified for each test network."; } // If we have a generic net (specified by net or net_param, rather than // test_net or test_net_param), we may have an unlimited number of actual @@ -119,7 +121,7 @@ void Solver::InitTestNets() { const int num_test_net_instances = num_test_nets + num_generic_net_instances; if (param_.test_state_size()) { CHECK_EQ(param_.test_state_size(), num_test_net_instances) - << "test_state must be unspecified or specified once per test net."; + << "test_state must be unspecified or specified once per test net."; } if (num_test_net_instances) { CHECK_GT(param_.test_interval(), 0); @@ -134,7 +136,7 @@ void Solver::InitTestNets() { for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { 
sources[test_net_id] = "test_net file: " + param_.test_net(i); ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); + &net_params[test_net_id]); } const int remaining_test_nets = param_.test_iter_size() - test_net_id; if (has_net_param) { @@ -163,13 +165,13 @@ void Solver::InitTestNets() { } net_params[i].mutable_state()->CopyFrom(net_state); LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; + << "Creating test net (#" << i << ") specified by " << sources[i]; test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); } } -template +template void Solver::Step(int iters) { vector*> bottom_vec; const int start_iter = iter_; @@ -185,19 +187,19 @@ void Solver::Step(int iters) { switch (Caffe::mode()) { case Caffe::CPU: caffe_set(blob->count(), static_cast(0), - blob->mutable_cpu_diff()); + blob->mutable_cpu_diff()); break; case Caffe::GPU: #ifndef CPU_ONLY caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + blob->mutable_gpu_diff()); #else NO_GPU; #endif case Caffe::APU: #ifndef CPU_ONLY caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + blob->mutable_gpu_diff()); #else NO_GPU; #endif @@ -206,7 +208,7 @@ void Solver::Step(int iters) { } if (param_.test_interval() && iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization())) { + && (iter_ > 0 || param_.test_initialization())) { TestAll(); } @@ -228,10 +230,10 @@ void Solver::Step(int iters) { smoothed_loss += (loss - losses[idx]) / average_loss; losses[idx] = loss; printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, - losses[idx], idx); + losses[idx], idx); } printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", - smoothed_loss, average_loss, losses.size()); + smoothed_loss, average_loss, losses.size()); if (display) { LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = 
net_->output_blobs(); @@ -239,18 +241,18 @@ void Solver::Step(int iters) { for (int j = 0; j < result.size(); ++j) { const Dtype* result_vec = result[j]->cpu_data(); const string& output_name = - net_->blob_names()[net_->output_blob_indices()[j]]; + net_->blob_names()[net_->output_blob_indices()[j]]; const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + net_->blob_loss_weights()[net_->output_blob_indices()[j]]; for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; + << " = " << loss_weight * result_vec[k] << " loss)"; } LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); + << score_index++ << ": " << output_name << " = " + << result_vec[k] << loss_msg_stream.str(); } } } @@ -267,7 +269,7 @@ void Solver::Step(int iters) { } } -template +template void Solver::Solve(const char* resume_file) { LOG(INFO) << "Solving " << net_->name(); LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); @@ -283,7 +285,7 @@ void Solver::Solve(const char* resume_file) { // If we haven't already, save a snapshot after optimization, unless // overridden by setting snapshot_after_train := false if (param_.snapshot_after_train() - && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { + && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { Snapshot(); } // After the optimization is done, run an additional train and test pass to @@ -303,19 +305,19 @@ void Solver::Solve(const char* resume_file) { LOG(INFO) << "Optimization Done."; } -template +template void Solver::TestAll() { for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { Test(test_net_id); } } -template +template void Solver::Test(const int test_net_id) { LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; + << ", 
Testing net (#" << test_net_id << ")"; CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); + ShareTrainedLayersWith(net_.get()); vector < Dtype > test_score; vector test_score_output_id; vector*> bottom_vec; @@ -324,7 +326,7 @@ void Solver::Test(const int test_net_id) { for (int i = 0; i < param_.test_iter(test_net_id); ++i) { Dtype iter_loss; const vector*>& result = - test_net->Forward(bottom_vec, &iter_loss); + test_net->Forward(bottom_vec, &iter_loss); if (param_.test_compute_loss()) { loss += iter_loss; } @@ -352,21 +354,21 @@ void Solver::Test(const int test_net_id) { } for (int i = 0; i < test_score.size(); ++i) { const int output_blob_index = - test_net->output_blob_indices()[test_score_output_id[i]]; + test_net->output_blob_indices()[test_score_output_id[i]]; const string& output_name = test_net->blob_names()[output_blob_index]; const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; ostringstream loss_msg_stream; const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); if (loss_weight) { loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; + << " = " << loss_weight * mean_score << " loss)"; } LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " - << mean_score << loss_msg_stream.str(); + << mean_score << loss_msg_stream.str(); } } -template +template void Solver::Snapshot() { NetParameter net_param; // For intermediate results, we will also dump the gradient values. @@ -390,7 +392,7 @@ void Solver::Snapshot() { WriteProtoToBinaryFile(state, snapshot_filename.c_str()); } -template +template void Solver::Restore(const char* state_file) { SolverState state; NetParameter net_param; @@ -419,7 +421,7 @@ void Solver::Restore(const char* state_file) { // // where base_lr, max_iter, gamma, step, stepvalue and power are defined // in the solver parameter protocol buffer, and iter is the current iteration. 
-template +template Dtype SGDSolver::GetLearningRate() { Dtype rate; const string& lr_policy = this->param_.lr_policy(); @@ -428,37 +430,37 @@ Dtype SGDSolver::GetLearningRate() { } else if (lr_policy == "step") { this->current_step_ = this->iter_ / this->param_.stepsize(); rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "exp") { rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); } else if (lr_policy == "inv") { rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - -this->param_.power()); + pow(Dtype(1) + this->param_.gamma() * this->iter_, + -this->param_.power()); } else if (lr_policy == "multistep") { if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { + this->iter_ >= this->param_.stepvalue(this->current_step_)) { this->current_step_++; LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; + this->iter_ << ", step = " << this->current_step_; } rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "poly") { rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); + (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + this->param_.power()); } else if (lr_policy == "sigmoid") { rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); + (Dtype(1.) 
+ exp(-this->param_.gamma() * (Dtype(this->iter_) - + Dtype(this->param_.stepsize()))))); } else { LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; } return rate; } -template +template void SGDSolver::PreSolve() { // Initialize the history const vector > >& net_params = this->net_->params(); @@ -473,7 +475,7 @@ void SGDSolver::PreSolve() { } } -template +template void SGDSolver::ClipGradients() { const Dtype clip_gradients = this->param_.clip_gradients(); if (clip_gradients < 0) { @@ -490,8 +492,8 @@ void SGDSolver::ClipGradients() { if (l2norm_diff > clip_gradients) { Dtype scale_factor = clip_gradients / l2norm_diff; LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " - << l2norm_diff << " > " << clip_gradients << ") " - << "by scale factor " << scale_factor; + << l2norm_diff << " > " << clip_gradients << ") " + << "by scale factor " << scale_factor; for (int i = 0; i < net_params.size(); ++i) { if (this->net_->param_owners()[i] < 0) { net_params[i]->scale_diff(scale_factor); @@ -500,7 +502,7 @@ void SGDSolver::ClipGradients() { } } -template +template void SGDSolver::ApplyUpdate() { Dtype rate = GetLearningRate(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { @@ -515,7 +517,7 @@ void SGDSolver::ApplyUpdate() { this->net_->Update(); } -template +template void SGDSolver::Normalize(int param_id) { if (this->param_.iter_size() == 1) { return; @@ -526,13 +528,13 @@ void SGDSolver::Normalize(int param_id) { switch (Caffe::mode()) { case Caffe::CPU: { caffe_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_cpu_diff()); + net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_gpu_diff()); + net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif @@ -543,11 +545,11 @@ void SGDSolver::Normalize(int param_id) { } } -template +template void 
SGDSolver::Regularize(int param_id) { const vector > >& net_params = this->net_->params(); const vector& net_params_weight_decay = - this->net_->params_weight_decay(); + this->net_->params_weight_decay(); Dtype weight_decay = this->param_.weight_decay(); string regularization_type = this->param_.regularization_type(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; @@ -558,17 +560,17 @@ void SGDSolver::Regularize(int param_id) { if (regularization_type == "L2") { // add weight decay caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); } else if (regularization_type == "L1") { caffe_cpu_sign(net_params[param_id]->count(), - net_params[param_id]->cpu_data(), - temp_[param_id]->mutable_cpu_data()); + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); caffe_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); + local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } @@ -581,17 +583,17 @@ void SGDSolver::Regularize(int param_id) { if (regularization_type == "L2") { // add weight decay caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); } else if (regularization_type == "L1") { caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); + net_params[param_id]->gpu_data(), + temp_[param_id]->mutable_gpu_data()); caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - 
net_params[param_id]->mutable_gpu_diff()); + local_decay, + temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } @@ -606,7 +608,7 @@ void SGDSolver::Regularize(int param_id) { } } -template +template void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector > >& net_params = this->net_->params(); const vector& net_params_lr = this->net_->params_lr(); @@ -616,21 +618,21 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { switch (Caffe::mode()) { case Caffe::CPU: { caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); + net_params[param_id]->gpu_diff(), momentum, + history_[param_id]->mutable_gpu_data()); caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + history_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif @@ -641,7 +643,7 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { } } -template +template void SGDSolver::SnapshotSolverState(SolverState* state) { state->clear_history(); for (int i = 0; i < history_.size(); ++i) { @@ -651,17 +653,17 @@ void SGDSolver::SnapshotSolverState(SolverState* state) { } } -template +template void SGDSolver::RestoreSolverState(const SolverState& state) { CHECK_EQ(state.history_size(), history_.size()) - << 
"Incorrect length of history blobs."; + << "Incorrect length of history blobs."; LOG(INFO) << "SGDSolver: restoring history"; for (int i = 0; i < history_.size(); ++i) { history_[i]->FromProto(state.history(i)); } } -template +template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector > >& net_params = this->net_->params(); const vector& net_params_lr = this->net_->params_lr(); @@ -671,46 +673,46 @@ void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { case Caffe::CPU: { // save history momentum for stepping back caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); + this->history_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); // update history caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - this->history_[param_id]->mutable_cpu_data()); + net_params[param_id]->cpu_diff(), momentum, + this->history_[param_id]->mutable_cpu_data()); // compute update: step back then over step caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->cpu_data(), -momentum, - this->update_[param_id]->mutable_cpu_data()); + this->history_[param_id]->cpu_data(), -momentum, + this->update_[param_id]->mutable_cpu_data()); // copy caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); + this->update_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // save history momentum for stepping back caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); + this->history_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), 
momentum, - this->history_[param_id]->mutable_gpu_data()); + net_params[param_id]->gpu_diff(), momentum, + this->history_[param_id]->mutable_gpu_data()); // compute update: step back then over step caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->gpu_data(), -momentum, - this->update_[param_id]->mutable_gpu_data()); + this->history_[param_id]->gpu_data(), -momentum, + this->update_[param_id]->mutable_gpu_data()); // copy caffe_gpu_copy(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + this->update_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif @@ -721,7 +723,7 @@ void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { } } -template +template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector > >& net_params = this->net_->params(); const vector& net_params_lr = this->net_->params_lr(); @@ -731,64 +733,64 @@ void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { case Caffe::CPU: { // compute square of gradient in update caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); + net_params[param_id]->cpu_diff(), Dtype(2), + this->update_[param_id]->mutable_cpu_data()); // update history caffe_add(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->history_[param_id]->mutable_cpu_data()); + this->update_[param_id]->cpu_data(), + this->history_[param_id]->cpu_data(), + this->history_[param_id]->mutable_cpu_data()); // prepare update caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); + this->history_[param_id]->cpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_cpu_data()); caffe_add_scalar(net_params[param_id]->count(), - 
delta, this->update_[param_id]->mutable_cpu_data()); + delta, this->update_[param_id]->mutable_cpu_data()); caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); + net_params[param_id]->cpu_diff(), + this->update_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); // scale and copy caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); + this->update_[param_id]->cpu_data(), Dtype(0), + net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), Dtype(2), - this->update_[param_id]->mutable_gpu_data()); + net_params[param_id]->gpu_diff(), Dtype(2), + this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_add(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - this->history_[param_id]->gpu_data(), - this->history_[param_id]->mutable_gpu_data()); + this->update_[param_id]->gpu_data(), + this->history_[param_id]->gpu_data(), + this->history_[param_id]->mutable_gpu_data()); // prepare update caffe_gpu_powx(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_gpu_data()); + this->history_[param_id]->gpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_gpu_data()); + delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), - this->update_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); + net_params[param_id]->gpu_diff(), + this->update_[param_id]->gpu_data(), + 
this->update_[param_id]->mutable_gpu_data()); // scale and copy caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->gpu_data(), Dtype(0), - net_params[param_id]->mutable_gpu_diff()); + this->update_[param_id]->gpu_data(), Dtype(0), + net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 94d62e0e..67f5984b 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -38,8 +38,8 @@ namespace caffe { SyncedMemory::~SyncedMemory() { if (cpu_ptr_ && own_cpu_data_) { OCL_CHECK( - clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, - cpu_ptr_, 0, NULL, NULL)); + clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + cpu_ptr_, 0, NULL, NULL)); clFinish(amdDevice.CommandQueue); } if (gpu_cache_ptr_ && own_cpu_data_) { @@ -62,11 +62,12 @@ inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, - size_, NULL, NULL); + size_, NULL, NULL); //} cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, - (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, - 0, NULL, NULL, NULL); + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, + size_, + 0, NULL, NULL, NULL); memset(cpu_ptr_, 0, size_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; @@ -75,15 +76,15 @@ inline void SyncedMemory::to_cpu() { #ifndef CPU_ONLY if (cpu_ptr_ == NULL) { gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, - CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); + CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, - (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, - size_, 0, NULL, NULL, NULL); + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, + size_, 0, NULL, NULL, NULL); own_cpu_data_ = true; } OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) 
gpu_ptr_, - (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_, + (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); clFinish(amdDevice.CommandQueue); head_ = SYNCED; #else @@ -105,7 +106,7 @@ inline void SyncedMemory::to_gpu() { switch (head_) { case UNINITIALIZED: { cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - size_, NULL, NULL); + size_, NULL, NULL); if (NULL == tmpMem) { fprintf(stderr, "Failed to create memory object\n"); break; @@ -118,15 +119,15 @@ inline void SyncedMemory::to_gpu() { case HEAD_AT_CPU: { if (gpu_ptr_ == NULL) { cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - size_, NULL, NULL); + size_, NULL, NULL); if (NULL == tmpMem) { fprintf(stderr, "Failed to create memory object\n"); } gpu_ptr_ = (void*) tmpMem; } OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, - (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); clFinish(amdDevice.CommandQueue); head_ = SYNCED; #ifdef Track_data_transfer diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 7d0a85aa..4c0ce04e 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -6,9 +6,10 @@ namespace caffe { Timer::Timer() - : initted_(false), - running_(false), - has_run_at_least_once_(false) { +: + initted_(false), + running_(false), + has_run_at_least_once_(false) { Init(); } @@ -98,7 +99,7 @@ float CPUTimer::MilliSeconds() { Stop(); } this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); + this->start_cpu_).total_milliseconds(); return this->elapsed_milliseconds_; } @@ -111,7 +112,7 @@ float CPUTimer::MicroSeconds() { Stop(); } this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); + 
this->start_cpu_).total_microseconds(); return this->elapsed_microseconds_; } diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index aec747af..d8adce8a 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -14,7 +14,7 @@ void LevelDB::Open(const string& source, Mode mode) { options.create_if_missing = mode != READ; leveldb::Status status = leveldb::DB::Open(options, source, &db_); CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); + << std::endl << status.ToString(); LOG(INFO) << "Opened leveldb " << source; } diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 69cc47bc..886ac85b 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -34,14 +34,14 @@ namespace caffe { -template extern std::string get_dtype_suffix(); +template extern std::string get_dtype_suffix(); -template +template void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) { int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; @@ -55,7 +55,7 @@ void im2col_cpu(const Dtype* data_im, const int channels, int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; + data_im[(c_im * height + h_pad) * width + w_pad]; else data_col[(c * height_col + h) * width_col + w] = 0; } @@ -64,20 +64,20 @@ void im2col_cpu(const Dtype* data_im, const int channels, } 
template void im2col_cpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_col); template void im2col_cpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_col); -template +template void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im) { + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im) { caffe_set(height * width * channels, Dtype(0), data_im); int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; @@ -92,26 +92,26 @@ void col2im_cpu(const Dtype* data_col, const int channels, int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; + data_col[(c * height_col + h) * width_col + w]; } } } } template void col2im_cpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int 
stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); template void col2im_cpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im); -template +template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int optnum) { + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset, int optnum) { std::string kernel_name = "col2im_opt" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad - ksize) / stride + 1; @@ -138,26 +138,26 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu_opt(const float* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_im, const int img_offset, int optnum); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_im, 
const int img_offset, int optnum); template void col2im_gpu_opt(const double* data_col, - const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_im, const int img_offset, int optnum); + const int col_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_im, const int img_offset, int optnum); -template +template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col, const int col_offset) - { + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) + { std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -186,30 +186,30 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void im2col_gpu(const float* data_im, const int img_offset, - const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col, const int col_offset); + const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + 
float* data_col, const int col_offset); template void im2col_gpu(const double* data_im, const int img_offset, - const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col, const int col_offset); + const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col, const int col_offset); -template +template void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset) - { + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) + { std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -238,25 +238,26 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu(const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, float* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int 
patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, float* data_im, + const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im, const int img_offset); -template +template void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset) { + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset) { int height_col = (height + 2 * pad - ksize) / stride + 1; int width_col = (width + 2 * pad - ksize) / stride + 1; @@ -279,25 +280,25 @@ void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); clFinish(amdDevice.CommandQueue); } template void im2col_gpu(cl_kernel Kernel, const float* data_im, - const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset); + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int 
stride, float* data_col, const int col_offset); template void im2col_gpu(cl_kernel Kernel, const double* data_im, - const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset); + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset); -template +template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum) { + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col, const int col_offset, int optnum) { std::string kernel_name = "im2col_opt" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -326,23 +327,23 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void im2col_gpu_opt(const float* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset, int optnum); + const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col, const int col_offset, int optnum); template void im2col_gpu_opt(const double* data_im, - const int img_offset, const int channels, - const int height, const int width, const int 
ksize, const int pad, - const int stride, double* data_col, const int col_offset, int optnum); + const int img_offset, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col, const int col_offset, int optnum); -template +template void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset) { + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset) { std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -369,17 +370,17 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu(const float* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, float* data_im, const int img_offset); + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, float* data_im, const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, double* data_im, const int img_offset); + const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, double* data_im, const int img_offset); } // namespace caffe diff --git 
a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index be0ce3b4..6435427e 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -10,11 +10,11 @@ namespace caffe { template __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_col) { + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_col) { CUDA_KERNEL_LOOP(index, n) { int w_out = index % width_col; int h_index = index / width_col; @@ -39,12 +39,12 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, } } -template +template void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. 
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; @@ -53,30 +53,30 @@ void im2col_gpu(const Dtype* data_im, const int channels, // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<>>( - num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, - pad_w, stride_h, stride_w, height_col, - width_col, data_col); + num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, + pad_w, stride_h, stride_w, height_col, + width_col, data_col); CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_gpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); template __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_im) { + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_im) { CUDA_KERNEL_LOOP(index, n) { Dtype val = 0; int w = index % width + pad_w; @@ -101,11 
+101,11 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, } } -template +template void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, Dtype* data_im) { int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int num_kernels = channels * height * width; @@ -114,20 +114,20 @@ void col2im_gpu(const Dtype* data_col, const int channels, // NOLINT_NEXT_LINE(whitespace/operators) col2im_gpu_kernel<<>>( - num_kernels, data_col, height, width, channels, patch_h, patch_w, - pad_h, pad_w, stride_h, stride_w, - height_col, width_col, data_im); + num_kernels, data_col, height, width, channels, patch_h, patch_w, + pad_h, pad_w, stride_h, stride_w, + height_col, width_col, data_im); CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void col2im_gpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); template void col2im_gpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im); } // namespace caffe diff 
--git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp index 2fbad3a9..299d1fd0 100644 --- a/src/caffe/util/insert_splits.cpp +++ b/src/caffe/util/insert_splits.cpp @@ -31,7 +31,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { for (int j = 0; j < layer_param.bottom_size(); ++j) { const string& blob_name = layer_param.bottom(j); if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { + blob_name_to_last_top_idx.end()) { LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; } const pair& bottom_idx = make_pair(i, j); @@ -46,7 +46,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { // A use of a top blob as a loss should be handled similarly to the use of // a top blob as an input (bottom) blob to another layer. const int last_loss = - std::min(layer_param.loss_weight_size(), layer_param.top_size()); + std::min(layer_param.loss_weight_size(), layer_param.top_size()); for (int j = 0; j < last_loss; ++j) { const string& blob_name = layer_param.top(j); const pair& top_idx = blob_name_to_last_top_idx[blob_name]; @@ -66,7 +66,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { LayerParameter* split_layer_param = param_split->add_layer(); const float kZeroLossWeight = 0; ConfigureSplitLayer(layer_name, blob_name, i, split_count, - kZeroLossWeight, split_layer_param); + kZeroLossWeight, split_layer_param); } } for (int i = 0; i < param.layer_size(); ++i) { @@ -75,13 +75,13 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { // Replace any shared bottom blobs with split layer outputs. 
for (int j = 0; j < layer_param->bottom_size(); ++j) { const pair& top_idx = - bottom_idx_to_source_top_idx[make_pair(i, j)]; + bottom_idx_to_source_top_idx[make_pair(i, j)]; const int split_count = top_idx_to_bottom_count[top_idx]; if (split_count > 1) { const string& layer_name = layer_idx_to_layer_name[top_idx.first]; const string& blob_name = layer_param->bottom(j); layer_param->set_bottom(j, SplitBlobName(layer_name, - blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); + blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); } } // Create split layer for any top blobs used by other layer as bottom @@ -95,7 +95,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { LayerParameter* split_layer_param = param_split->add_layer(); const float loss_weight = top_idx_to_loss_weight[top_idx]; ConfigureSplitLayer(layer_name, blob_name, j, split_count, - loss_weight, split_layer_param); + loss_weight, split_layer_param); if (loss_weight) { layer_param->clear_loss_weight(); top_idx_to_bottom_split_idx[top_idx]++; @@ -106,15 +106,15 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { } void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param) { + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param) { split_layer_param->Clear(); split_layer_param->add_bottom(blob_name); split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx)); split_layer_param->set_type("Split"); for (int k = 0; k < split_count; ++k) { split_layer_param->add_top( - SplitBlobName(layer_name, blob_name, blob_idx, k)); + SplitBlobName(layer_name, blob_name, blob_idx, k)); if (loss_weight) { if (k == 0) { split_layer_param->add_loss_weight(loss_weight); @@ -126,18 +126,18 @@ void ConfigureSplitLayer(const string& layer_name, const string& 
blob_name, } string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx) { + const int blob_idx) { ostringstream split_layer_name; split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split"; + << "_split"; return split_layer_name.str(); } string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx) { + const int blob_idx, const int split_idx) { ostringstream split_blob_name; split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split_" << split_idx; + << "_split_" << split_idx; return split_blob_name.str(); } diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index c3be8a76..63dcf312 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -68,7 +68,7 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) { } cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { + const int height, const int width, const bool is_color) { cv::Mat cv_img; int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); @@ -86,12 +86,12 @@ cv::Mat ReadImageToCVMat(const string& filename, } cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { + const int height, const int width) { return ReadImageToCVMat(filename, height, width, true); } cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { + const bool is_color) { return ReadImageToCVMat(filename, 0, 0, is_color); } @@ -100,7 +100,7 @@ cv::Mat ReadImageToCVMat(const string& filename) { } // Do the file extension and encoding match? static bool matchExt(const std::string & fn, - std::string en) { + std::string en) { size_t p = fn.rfind('.'); std::string ext = p != fn.npos ? 
fn.substr(p) : fn; std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); @@ -112,18 +112,18 @@ static bool matchExt(const std::string & fn, return false; } bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { + const int height, const int width, const bool is_color, + const std::string & encoding, Datum* datum) { cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); if (cv_img.data) { if (encoding.size()) { if ((cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding)) + matchExt(filename, encoding)) return ReadFileToDatum(filename, label, datum); std::vector < uchar > buf; cv::imencode("." + encoding, cv_img, buf); datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); + buf.size())); datum->set_label(label); datum->set_encoded(true); return true; @@ -137,7 +137,7 @@ bool ReadImageToDatum(const string& filename, const int label, } bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { + Datum* datum) { std::streampos size; fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); @@ -229,13 +229,13 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { } // Verifies format of data stored in HDF5 file and reshapes blob accordingly. -template +template void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { + hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, + Blob* blob) { // Verify that the dataset exists. CHECK(H5LTfind_dataset(file_id, dataset_name_)) - << "Failed to find HDF5 dataset " << dataset_name_; + << "Failed to find HDF5 dataset " << dataset_name_; // Verify that the number of dimensions is in the accepted range. 
herr_t status; int ndims; @@ -248,7 +248,7 @@ void hdf5_load_nd_dataset_helper( std::vector < hsize_t > dims(ndims); H5T_class_t class_; status = H5LTget_dataset_info( - file_id, dataset_name_, dims.data(), &class_, NULL); + file_id, dataset_name_, dims.data(), &class_, NULL); CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; @@ -259,47 +259,47 @@ void hdf5_load_nd_dataset_helper( blob->Reshape(blob_dims); } -template<> +template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); herr_t status = H5LTread_dataset_float( - file_id, dataset_name_, blob->mutable_cpu_data()); + file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; } -template<> +template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); herr_t status = H5LTread_dataset_double( - file_id, dataset_name_, blob->mutable_cpu_data()); + file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; } -template<> +template <> void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { + const hid_t file_id, const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make 
float dataset " << dataset_name; } -template<> +template <> void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { + const hid_t file_id, const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 61162be6..4d2c9de6 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -40,265 +40,271 @@ static const clblasOrder order = clblasColumnMajor; namespace caffe { -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + ldb, beta, C, N); } -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + ldb, beta, C, N); } -template<> +template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; int ldc = N; //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0, - ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, + ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { clblasTranspose transA = - (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0, - ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, + ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const int offA, const float* B, - const int offB, const float beta, float* C, const int offC) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { cl_event event; clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, - offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, + offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); return event; } -template<> +template <> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const int offA, const double* B, - const int offB, const double beta, double* C, const int offC) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { cl_event event; clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, - offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, + offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); return event; } -template<> +template <> cl_event caffe_gpu_gemm(cl_command_queue *queue, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const int offA, const float* B, - const int offB, const float beta, float* C, const int offC) { + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { cl_event event; clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, - offC, ldc, 1, queue, 0, NULL, &event)); + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, + offC, ldc, 1, queue, 0, NULL, &event)); return event; } -template<> +template <> cl_event caffe_gpu_gemm(cl_command_queue *queue, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const int offA, const double* B, - const int offB, const double beta, double* C, const int offC) { + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { cl_event event; clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C, - offC, ldc, 1, queue, 0, NULL, &event)); + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, + offC, ldc, 1, queue, 0, NULL, &event)); return event; } -template<> +template <> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -template<> +template <> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, size_t offA, int lda, - const float* x, size_t offx, const float beta, int incx, - float* y, size_t offy, int incy) { + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, + float* y, size_t offy, int incy) { clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, - M, N, (cl_float) alpha, (cl_mem) A, offA, lda, - (cl_mem) x, offx, incx, (cl_float) beta, - (cl_mem) y, offy, incy, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + M, N, (cl_float) alpha, (cl_mem) A, offA, lda, + (cl_mem) x, offx, incx, (cl_float) beta, + (cl_mem) y, offy, incy, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, size_t offA, int lda, - const double* x, size_t offx, const double beta, int incx, - double* y, size_t offy, int incy) { + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, + double* y, size_t offy, int incy) { clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; CLBLAS_CHECK( - clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, - offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, - incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, - M, N, (cl_float) alpha, (cl_mem) A, 0, N, - (cl_mem) x, 0, 1, (cl_float) beta, - (cl_mem) y, 0, 1, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + M, N, (cl_float) alpha, (cl_mem) A, 0, N, + (cl_mem) x, 0, 1, (cl_float) beta, + (cl_mem) y, 0, 1, + 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; CLBLAS_CHECK( - clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, - N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { + float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } -template<> +template <> void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { + double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } -template<> +template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { + float* Y) { CLBLAS_CHECK( - clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { + double* Y) { 
CLBLAS_CHECK( - clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_sgnbit(const int n, const float* x, float* y) - { + { } -template<> +template <> void caffe_gpu_sgnbit(const int n, const double* x, double* y) - { + { } -template<> +template <> void caffe_gpu_abs(const int n, const float* x, float* y) - { + { caffe_gpu_abs_ocl(n, x, y); } -template<> +template <> void caffe_gpu_abs(const int n, const double* x, double* y) - { + { caffe_gpu_abs_ocl(n, x, y); } -template<> +template <> void caffe_set(const int N, const float alpha, float* Y) { if (alpha == 0) { memset(Y, 0, sizeof(float) * N); @@ -309,7 +315,7 @@ void caffe_set(const int N, const float alpha, float* Y) { } } -template<> +template <> void caffe_set(const int N, const double alpha, double* Y) { if (alpha == 0) { memset(Y, 0, sizeof(double) * N); @@ -320,35 +326,35 @@ void caffe_set(const int N, const double alpha, double* Y) { } } -template<> +template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { Y[i] += alpha; } } -template<> +template <> void caffe_add_scalar(const int N, const double alpha, double* Y) { for (int i = 0; i < N; ++i) { Y[i] += alpha; } } -template<> +template <> void caffe_copy(const int N, const float* X, float* Y) { cblas_scopy(N, X, 1, Y, 1); } -template<> +template <> void caffe_copy(const int N, const double* X, double* Y) { cblas_dcopy(N, X, 1, Y, 1); } //template void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) - { + { clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, - NULL, NULL); + NULL, NULL); // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); } /* @@ -357,166 +363,168 @@ void caffe_gpu_memcpy(const size_t N, const void *X, 
void *Y) template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); */ -template<> +template <> void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) - { + { OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N, - 0, NULL, NULL)); + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, + 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) - { + { OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N, - 0, NULL, NULL)); + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, + 0, NULL, NULL)); } -template<> +template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { if (X != Y) { CLBLAS_CHECK( - clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } } -template<> +template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { if (X != Y) { CLBLAS_CHECK( - clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } } -template<> +template <> void caffe_scal(const int N, const float alpha, float *X) { cblas_sscal(N, alpha, X, 1); } -template<> +template <> void caffe_scal(const int N, const double alpha, double *X) { cblas_dscal(N, alpha, X, 1); } -template<> +template <> void caffe_gpu_scal(const int N, const float alpha, float *X) { CLBLAS_CHECK( - clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); + clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } -template<> +template <> void 
caffe_gpu_scal(const int N, const double alpha, double *X) { CLBLAS_CHECK( - clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); + clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } -template<> +template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { + const float beta, float* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } -template<> +template <> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { + const double beta, double* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } -template<> +template <> void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { + const float beta, float* Y) { cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } -template<> +template <> void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { + const double beta, double* Y) { cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } -template<> +template <> void caffe_add(const int n, const float* a, const float* b, - float* y) { + float* y) { vsAdd(n, a, b, y); } -template<> +template <> void caffe_add(const int n, const double* a, const double* b, - double* y) { + double* y) { vdAdd(n, a, b, y); } -template<> +template <> void caffe_sub(const int n, const float* a, const float* b, - float* y) { + float* y) { vsSub(n, a, b, y); } -template<> +template <> void caffe_sub(const int n, const double* a, const double* b, - double* y) { + double* y) { vdSub(n, a, b, y); } -template<> +template <> void caffe_mul(const int n, const float* a, const float* b, - float* y) { + float* y) { vsMul(n, a, b, y); } -template<> +template <> void caffe_mul(const int n, const double* a, const double* b, - double* y) { + double* y) { vdMul(n, a, b, y); } -template<> +template <> void caffe_div(const int n, const float* a, 
const float* b, - float* y) { + float* y) { vsDiv(n, a, b, y); } -template<> +template <> void caffe_div(const int n, const double* a, const double* b, - double* y) { + double* y) { vdDiv(n, a, b, y); } -template<> +template <> void caffe_powx(const int n, const float* a, const float b, - float* y) { + float* y) { vsPowx(n, a, b, y); } -template<> +template <> void caffe_powx(const int n, const double* a, const double b, - double* y) { + double* y) { vdPowx(n, a, b, y); } -template<> +template <> void caffe_sqr(const int n, const float* a, float* y) { vsSqr(n, a, y); } -template<> +template <> void caffe_sqr(const int n, const double* a, double* y) { vdSqr(n, a, y); } -template<> +template <> void caffe_exp(const int n, const float* a, float* y) { vsExp(n, a, y); } -template<> +template <> void caffe_exp(const int n, const double* a, double* y) { vdExp(n, a, y); } @@ -525,10 +533,10 @@ unsigned int caffe_rng_rand() { return (*caffe_rng())(); } -template +template Dtype caffe_nextafter(const Dtype b) { return boost::math::nextafter < Dtype > ( - b, std::numeric_limits < Dtype > ::max()); + b, std::numeric_limits < Dtype > ::max()); } template @@ -537,13 +545,13 @@ float caffe_nextafter(const float b); template double caffe_nextafter(const double b); -template +template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_LE(a, b); boost::uniform_real < Dtype - > random_distribution(a, caffe_nextafter(b)); + > random_distribution(a, caffe_nextafter(b)); boost::variate_generator > variate_generator(caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { @@ -555,15 +563,15 @@ void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { template void caffe_rng_uniform(const int n, const float a, const float b, - float* r); + float* r); template void caffe_rng_uniform(const int n, const double a, const double b, - double* r); + double* r); -template +template void caffe_rng_gaussian(const 
int n, const Dtype a, - const Dtype sigma, Dtype* r) { + const Dtype sigma, Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_GT(sigma, 0); @@ -579,13 +587,13 @@ void caffe_rng_gaussian(const int n, const Dtype a, template void caffe_rng_gaussian(const int n, const float mu, - const float sigma, float* r); + const float sigma, float* r); template void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); + const double sigma, double* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { CHECK_GE(n, 0); CHECK(r); @@ -605,7 +613,7 @@ void caffe_rng_bernoulli(const int n, const double p, int* r); template void caffe_rng_bernoulli(const int n, const float p, int* r); -template +template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { CHECK_GE(n, 0); CHECK(r); @@ -625,104 +633,104 @@ void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); // -template<> +template <> float caffe_cpu_dot(const int n, const float* x, const float* y) { return cblas_sdot(n, x, 1, y, 1); } -template<> +template <> double caffe_cpu_dot(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } -template<> +template <> void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { + float* out) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(float)), NULL, NULL); + (n * sizeof(float)), NULL, NULL); cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(float)), NULL, NULL); + (1 * sizeof(float)), NULL, NULL); clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); + &(amdDevice.CommandQueue), 0, NULL, NULL); clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), - out, 0, NULL, NULL); + out, 0, NULL, NULL); 
clReleaseMemObject(scratchBuff); clReleaseMemObject(d_out); } -template<> +template <> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { + double * out) { //need to pass in scratchBuff //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(double)), NULL, NULL); + (n * sizeof(double)), NULL, NULL); cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(double)), NULL, NULL); + (1 * sizeof(double)), NULL, NULL); clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); + &(amdDevice.CommandQueue), 0, NULL, NULL); clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), - out, 0, NULL, NULL); + out, 0, NULL, NULL); clReleaseMemObject(scratchBuff); clReleaseMemObject(d_out); } -template<> +template <> int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { + const float* y) { int dist = 0; for (int i = 0; i < n; ++i) { dist += __builtin_popcount(static_cast(x[i]) ^ - static_cast(y[i])); + static_cast(y[i])); } return dist; } -template<> +template <> int caffe_cpu_hamming_distance(const int n, const double* x, - const double* y) { + const double* y) { int dist = 0; for (int i = 0; i < n; ++i) { dist += __builtin_popcountl(static_cast(x[i]) ^ - static_cast(y[i])); + static_cast(y[i])); } return dist; } -template<> +template <> float caffe_cpu_asum(const int n, const float* x) { return cblas_sasum(n, x, 1); } -template<> +template <> double caffe_cpu_asum(const int n, const double* x) { return cblas_dasum(n, x, 1); } -template<> +template <> void caffe_gpu_asum(const int n, const float* x, float* y) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(cl_float)), NULL, NULL); + (n * sizeof(cl_float)), NULL, NULL); 
cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(cl_float)), NULL, NULL); + (1 * sizeof(cl_float)), NULL, NULL); clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); + &(amdDevice.CommandQueue), 0, NULL, NULL); clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, - 0, NULL, NULL); + 0, NULL, NULL); clReleaseMemObject(scratchBuff); clReleaseMemObject(d_y); } -template<> +template <> void caffe_gpu_asum(const int n, const double* x, double* y) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(cl_double)), NULL, NULL); + (n * sizeof(cl_double)), NULL, NULL); cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(cl_double)), NULL, NULL); + (1 * sizeof(cl_double)), NULL, NULL); clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); + &(amdDevice.CommandQueue), 0, NULL, NULL); clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), - y, 0, NULL, NULL); + y, 0, NULL, NULL); clReleaseMemObject(scratchBuff); clReleaseMemObject(d_y); } @@ -735,195 +743,195 @@ INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); -template<> +template <> void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { cblas_scopy(n, x, 1, y, 1); cblas_sscal(n, alpha, y, 1); } -template<> +template <> void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { cblas_dcopy(n, x, 1, y, 1); cblas_dscal(n, alpha, y, 1); } -template<> +template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { caffe_gpu_copy(n, x, y); caffe_gpu_scal(n, alpha, y); } -template<> +template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { 
caffe_gpu_copy(n, x, y); caffe_gpu_scal(n, alpha, y); } -template +template void set_kernel(const int n, const Dtype alpha, Dtype* y) { } -template<> +template <> void caffe_gpu_set(const int N, const float alpha, float* Y) { ocl_memset(Y, alpha, N); } -template<> +template <> void caffe_gpu_set(const int N, const double alpha, double* Y) { ocl_memset(Y, alpha, N); } -template<> +template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { kernel_add_scalar(N, alpha, Y); } -template<> +template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { kernel_add_scalar(N, alpha, Y); } -template<> +template <> void caffe_gpu_exp(const int N, const float* a, float* y) { kernel_exp(N, a, y); } -template<> +template <> void caffe_gpu_exp(const int N, const double* a, double* y) { kernel_exp(N, a, y); } -template<> +template <> void caffe_gpu_sign(const int N, const float *X, float *Y) { caffe_gpu_sign_ocl(N, X, Y); } -template<> +template <> void caffe_gpu_sign(const int N, const double *X, double *Y) { caffe_gpu_sign_ocl(N, X, Y); } -template<> +template <> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { + float* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_sub(N, a, b, y); } -template<> +template <> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { + double* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_sub(N, a, b, y); } -template<> +template <> void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { + const float* b, float* y) { kernel_mul(N, a, b, y); } -template<> +template <> void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { + const double* b, double* y) { kernel_mul(N, a, b, y); } -template<> +template <> void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { + const float* b, float* y) { kernel_div(N, a, b, y); } -template<> +template <> void caffe_gpu_div(const int N, 
const double* a, - const double* b, double* y) { + const double* b, double* y) { kernel_div(N, a, b, y); } -template<> +template <> void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { + const float alpha, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_powx(N, a, alpha, y); } -template<> +template <> void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { + const double alpha, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_powx(N, a, alpha, y); } void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { + const float* b, uint8_t* y) { } void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { + const double* b, uint8_t* y) { } -template<> +template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, - const float* y) { + const float* y) { return 0; } -template<> +template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { + const double* y) { return 0; } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { } -template<> +template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { + float* r) { } -template<> +template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { + double* r) { } -template<> +template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { + float* r) { } -template<> +template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { + double* r) { } -template<> +template <> void caffe_gpu_log(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_log(N, a, y); } -template<> +template <> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_log(N, a, y); } -template<> +template <> void caffe_log(const int n, const 
float* a, float* y) { vsLn(n, a, y); } -template<> +template <> void caffe_log(const int n, const double* a, double* y) { vdLn(n, a, y); } -template +template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (X != Y) { if (Caffe::mode() == Caffe::GPU) { @@ -941,47 +949,47 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) { template void caffe_copy(const int N, const int* X, int* Y); template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); + unsigned int* Y); template void caffe_copy(const int N, const float* X, float* Y); template void caffe_copy(const int N, const double* X, double* Y); -template<> +template <> void caffe_abs(const int n, const float* a, float* y) { vsAbs(n, a, y); } -template<> +template <> void caffe_abs(const int n, const double* a, double* y) { vdAbs(n, a, y); } -template<> +template <> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { + float* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_add(N, a, b, y); } -template<> +template <> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { + double* y) { // NOLINT_NEXT_LINE(whitespace/operators) kernel_add(N, a, b, y); } -template<> +template <> float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { + const float* y, const int incy) { return cblas_sdot(n, x, incx, y, incy); } -template<> +template <> double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { + const int incx, const double* y, const int incy) { return cblas_ddot(n, x, incx, y, incy); } -template +template void caffe_set(const int N, const Dtype alpha, Dtype* Y) { if (alpha == 0) { memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 1bf783e4..64245bea 100644 --- a/src/caffe/util/math_functions.cu +++ 
b/src/caffe/util/math_functions.cu @@ -12,67 +12,67 @@ namespace caffe { -template<> +template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -template<> +template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + A, N, x, 1, &beta, y, 1)); } -template<> +template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + A, N, x, 1, &beta, y, 1)); } -template<> +template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { + float* Y) { CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } -template<> +template <> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { + double* Y) { CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } @@ -82,62 +82,62 @@ void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { } } -template<> +template <> void caffe_gpu_scal(const int N, const float alpha, float *X) { CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } -template<> +template <> void caffe_gpu_scal(const int N, const double alpha, double *X) { CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } -template<> +template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { + const float beta, float* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } -template<> +template <> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { + const double beta, double* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } -template<> +template <> void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { + float* out) { CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } -template<> +template <> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { + double * out) { CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } -template<> +template <> void caffe_gpu_asum(const int n, const float* x, float* y) { CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); } -template<> +template <> void 
caffe_gpu_asum(const int n, const double* x, double* y) { CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); } -template<> +template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } -template<> +template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } @@ -149,7 +149,7 @@ __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { } } -template +template void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { if (alpha == 0) { CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) @@ -157,7 +157,7 @@ void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { } // NOLINT_NEXT_LINE(whitespace/operators) set_kernel<<>>( - N, alpha, Y); + N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); @@ -171,14 +171,14 @@ CUDA_KERNEL_LOOP(index, n) { } } -template<> +template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { // NOLINT_NEXT_LINE(whitespace/operators) add_scalar_kernel<<>>( -N, alpha, Y); + N, alpha, Y); } -template<> +template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { // NOLINT_NEXT_LINE(whitespace/operators) add_scalar_kernel<<>>( @@ -193,7 +193,7 @@ y[index] = a[index] + b[index]; } } -template<> +template <> void caffe_gpu_add(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -201,7 +201,7 @@ add_kernel<<>>( N, a, b, y); } -template<> +template <> void caffe_gpu_add(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -217,7 +217,7 @@ y[index] = 
a[index] - b[index]; } } -template<> +template <> void caffe_gpu_sub(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -225,7 +225,7 @@ sub_kernel<<>>( N, a, b, y); } -template<> +template <> void caffe_gpu_sub(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -241,7 +241,7 @@ y[index] = a[index] * b[index]; } } -template<> +template <> void caffe_gpu_mul(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -249,7 +249,7 @@ mul_kernel<<>>( N, a, b, y); } -template<> +template <> void caffe_gpu_mul(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -265,7 +265,7 @@ y[index] = a[index] / b[index]; } } -template<> +template <> void caffe_gpu_div(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -273,7 +273,7 @@ div_kernel<<>>( N, a, b, y); } -template<> +template <> void caffe_gpu_div(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -288,14 +288,14 @@ y[index] = abs(a[index]); } } -template<> +template <> void caffe_gpu_abs(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) abs_kernel<<>>( N, a, y); } -template<> +template <> void caffe_gpu_abs(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) abs_kernel<<>>( @@ -309,14 +309,14 @@ y[index] = exp(a[index]); } } -template<> +template <> void caffe_gpu_exp(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) exp_kernel<<>>( N, a, y); } -template<> +template <> void caffe_gpu_exp(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) exp_kernel<<>>( @@ -330,14 +330,14 @@ y[index] = log(a[index]); } } -template<> +template <> void caffe_gpu_log(const int N, const float* a, float* y) { 
// NOLINT_NEXT_LINE(whitespace/operators) log_kernel<<>>( N, a, y); } -template<> +template <> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) log_kernel<<>>( @@ -352,7 +352,7 @@ y[index] = pow(a[index], alpha); } } -template<> +template <> void caffe_gpu_powx(const int N, const float* a, const float alpha, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -360,7 +360,7 @@ powx_kernel<<>>( N, a, alpha, y); } -template<> +template <> void caffe_gpu_powx(const int N, const double* a, const double alpha, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -390,28 +390,28 @@ static_cast(b[index])); } } -template<> +template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, const float* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). NOT_IMPLEMENTED; thrust::device_vector < uint8_t > popcounts(n); - // NOLINT_NEXT_LINE(whitespace/operators) + // NOLINT_NEXT_LINE(whitespace/operators) popc_kernel<<>>( n, x, y, thrust::raw_pointer_cast(popcounts.data())); return thrust::reduce(popcounts.begin(), popcounts.end(), (uint32_t) 0, thrust::plus()); } -template<> +template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, const double* y) { - // TODO: Fix caffe_gpu_hamming_distance (see failing unit test + // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). 
NOT_IMPLEMENTED; thrust::device_vector < uint8_t > popcounts(n); - // NOLINT_NEXT_LINE(whitespace/operators) + // NOLINT_NEXT_LINE(whitespace/operators) popcll_kernel<<>>( n, x, y, thrust::raw_pointer_cast(popcounts.data())); return thrust::reduce(popcounts.begin(), popcounts.end(), @@ -423,7 +423,7 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) { CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); } -template<> +template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, float* r) { CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); @@ -436,7 +436,7 @@ caffe_gpu_add_scalar(n, a, r); } } -template<> +template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, double* r) { CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); @@ -449,14 +449,14 @@ caffe_gpu_add_scalar(n, a, r); } } -template<> +template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, float* r) { CURAND_CHECK( curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } -template<> +template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { CURAND_CHECK( diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 6b8c5fee..8f44a106 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -32,9 +32,9 @@ #include "caffe/common.hpp" #include "caffe/util/ocl_util.hpp" namespace caffe { -template extern std::string get_dtype_suffix(); +template extern std::string get_dtype_suffix(); -template +template void ocl_memset(Dtype* buffer, const Dtype value, const int count) { std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -47,19 +47,19 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count) { size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ocl_memset(int* buffer, const int value, const int count); template void ocl_memset(float* buffer, const float value, - const int count); + const int count); template void ocl_memset(double* buffer, const double value, - const int count); + const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, - const int count) { + const int count) { cl_int err; err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); @@ -69,8 +69,8 @@ void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } @@ -79,11 +79,11 @@ void eventCallback(cl_event event, cl_int event_status, void* user_data) { cl_ulong ev_end_time = (cl_ulong) 0; double run_time; OCL_CHECK( - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, - sizeof(cl_ulong), &ev_start_time, NULL)); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &ev_start_time, NULL)); OCL_CHECK( - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), - &ev_end_time, NULL)); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), + &ev_end_time, NULL)); run_time = (double) (ev_end_time - ev_start_time); printf("The kernel's running time is %f s\n", run_time * 1.0e-9); } diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index be0c5894..8eb1a981 100644 
--- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -37,10 +37,9 @@ typedef unsigned int uint32_t; struct array4x32 { uint32_t v[4]; }; -template +template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, - Dtype threshold) - { + Dtype threshold) { std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); @@ -63,17 +62,18 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, size_t globalws[1] = { size }; size_t localws[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, - localws, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, + localws, 0, NULL, NULL)); } template void caffe_gpu_bernoulli(int* a, const unsigned int n, - float inf, float sup, float threshold); + float inf, float sup, float threshold); template void caffe_gpu_bernoulli(int* a, const unsigned int n, - double inf, double sup, double threshold); + double inf, double sup, double threshold); -template +template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, - const int M_, const int packing_num) { + const int M_, const int packing_num) { std::string kernel_name = "transform" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -89,18 +89,18 @@ void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; size_t uiLocal_Work_Size2[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); } template void transform_gpu(float* src, float* dst, const int top_offset, - const int N_, const int M_, const int packing_num); + 
const int N_, const int M_, const int packing_num); template void transform_gpu(double* src, double* dst, - const int top_offset, const int N_, const int M_, const int packing_num); + const int top_offset, const int N_, const int M_, const int packing_num); -template +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* bottom_data, Dtype* scale_data) { + const Dtype* bottom_data, Dtype* scale_data) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); @@ -109,16 +109,16 @@ void get_max_gpu(cl_kernel Kernel, const int num, const int dim, size_t Global_Work_Size[1] = { (size_t) num }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const float* bottom_data, float* scale_data); + const float* bottom_data, float* scale_data); template void get_max_gpu(cl_kernel Kernel, const int num, - const int dim, const double* bottom_data, double* scale_data); + const int dim, const double* bottom_data, double* scale_data); -template +template void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); @@ -127,18 +127,18 @@ void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { size_t Global_Work_Size[1] = { (size_t) num }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void exp_gpu(cl_kernel Kernel, const int num, const float* data, - float* out); + float* out); template void exp_gpu(cl_kernel Kernel, const int num, - const double* data, double* out); + const double* data, double* out); -template +template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* scale, Dtype* data) { + const Dtype* scale, Dtype* data) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); @@ -147,18 +147,18 @@ void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, size_t Global_Work_Size[1] = { (size_t)(num * dim) }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void softmax_div_gpu(cl_kernel Kernel, const int num, - const int dim, const float* scale, float* data); + const int dim, const float* scale, float* data); template void softmax_div_gpu(cl_kernel Kernel, const int num, - const int dim, const double* scale, double* data); + const int dim, const double* scale, double* data); -template +template Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { + const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); @@ -170,26 +170,25 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, size_t globalws[1] = { 256 }; size_t localws[1] = { 256 }; 
OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, - localws, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, + localws, 0, NULL, NULL)); void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, - CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); Dtype loss = *(Dtype*) h_loss; clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, - NULL); + NULL); return loss; } template float softmax_gpu(cl_kernel Kernel, const int num, - const int dim, const float* prob_data, const float* label, cl_mem d_loss); + const int dim, const float* prob_data, const float* label, cl_mem d_loss); template double softmax_gpu(cl_kernel Kernel, const int num, - const int dim, const double* prob_data, const double* label, cl_mem d_loss); + const int dim, const double* prob_data, const double* label, cl_mem d_loss); -template +template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) - { + const int spatial_dim, const Dtype* data, Dtype* out) { std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -202,22 +201,21 @@ void kernel_channel_max(const int num, const int channels, size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const float* data, float* out); + const int spatial_dim, const float* data, float* out); template void kernel_channel_max(const int num, const int channels, - const int 
spatial_dim, const double* data, double* out); + const int spatial_dim, const double* data, double* out); -template +template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) - { + const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data) { std::string kernel_name = "kernel_channel_subtract" - + get_dtype_suffix(); + + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); @@ -230,20 +228,20 @@ void kernel_channel_subtract(const int count, size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const float* channel_max, float* data); + const int num, const int channels, + const int spatial_dim, const float* channel_max, float* data); template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const double* channel_max, double* data); + const int num, const int channels, + const int spatial_dim, const double* channel_max, double* data); -template +template void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) - { + { std::string kernel_name = "kernel_mul" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -255,18 +253,17 @@ void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, 
Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_mul(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_mul(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); -template -void kernel_add_scalar(const int count, const Dtype data, Dtype* out) - { +template +void kernel_add_scalar(const int count, const Dtype data, Dtype* out) { std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -277,19 +274,18 @@ void kernel_add_scalar(const int count, const Dtype data, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_add_scalar(const int count, const float data, - float* out); + float* out); template void kernel_add_scalar(const int count, const double data, - double* out); + double* out); -template +template void kernel_powx(const int count, const Dtype* data, const Dtype alpha, - Dtype* out) - { + Dtype* out) { std::string kernel_name = "kernel_powx" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -301,18 +297,17 @@ void kernel_powx(const int count, const Dtype* data, const Dtype alpha, size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, 
NULL, NULL)); } template void kernel_powx(const int count, const float* data, - const float alpha, float* out); + const float alpha, float* out); template void kernel_powx(const int count, const double* data, - const double alpha, double* out); + const double alpha, double* out); -template -void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) - { +template +void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) { std::string kernel_name = "kernel_div" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -324,18 +319,17 @@ void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_div(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_div(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); -template -void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) - { +template +void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) { std::string kernel_name = "kernel_add" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -347,18 +341,17 @@ void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void 
kernel_add(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_add(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); -template -void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) - { +template +void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) { std::string kernel_name = "kernel_sub" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -370,18 +363,17 @@ void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_sub(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_sub(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); -template -void kernel_log(const int count, const Dtype* data, Dtype* out) - { +template +void kernel_log(const int count, const Dtype* data, Dtype* out) { std::string kernel_name = "kernel_log" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -392,17 +384,16 @@ void kernel_log(const int count, const Dtype* data, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_log(const int count, const float* data, float* out); template void kernel_log(const int count, const 
double* data, - double* out); + double* out); -template -void kernel_exp(const int count, const Dtype* data, Dtype* out) - { +template +void kernel_exp(const int count, const Dtype* data, Dtype* out) { std::string kernel_name = "kernel_exp" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -413,18 +404,17 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out) size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_exp(const int count, const float* data, float* out); template void kernel_exp(const int count, const double* data, - double* out); + double* out); -template +template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) - { + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -437,19 +427,18 @@ void kernel_channel_sum(const int num, const int channels, size_t Global_Work_Size[1] = { (size_t)(num * channels) }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const float* data, float* channel_sum); + const int spatial_dim, const float* data, float* channel_sum); template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const double* data, double* 
channel_sum); + const int spatial_dim, const double* data, double* channel_sum); -template +template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) - { + const int spatial_dim, const Dtype* channel_sum, Dtype* data) { std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -463,22 +452,21 @@ void kernel_channel_div(const int count, const int num, const int channels, size_t Global_Work_Size[1] = { (size_t) count }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_div(const int count, const int num, - const int channels, - const int spatial_dim, const float* channel_sum, float* data); + const int channels, + const int spatial_dim, const float* channel_sum, float* data); template void kernel_channel_div(const int count, const int num, - const int channels, - const int spatial_dim, const double* channel_sum, double* data); + const int channels, + const int spatial_dim, const double* channel_sum, double* data); -template +template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) - { + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) { std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -492,24 +480,23 @@ void kernel_channel_dot(const int num, const int channels, size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - 
Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const float* data_1, const float* data_2, - float* channel_dot); + const int spatial_dim, const float* data_1, const float* data_2, + float* channel_dot); template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const double* data_1, const double* data_2, - double* channel_dot); + const int spatial_dim, const double* data_1, const double* data_2, + double* channel_dot); -template +template void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) - { + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -521,34 +508,33 @@ void SoftmaxLossForwardGPU(const int nthreads, OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); OCL_CHECK( - clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); size_t Global_Work_Size[1] = { (size_t) nthreads }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SoftmaxLossForwardGPU(const int nthreads, - const float* prob_data, const float* label, float* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, float* counts); + const float* prob_data, const float* label, float* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, float* counts); template void SoftmaxLossForwardGPU(const int nthreads, - const double* prob_data, const double* label, double* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, double* counts); + const double* prob_data, const double* label, double* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, double* counts); -template +template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) - { + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { std::string kernel_name = "SoftmaxLossBackwardGPU" - + get_dtype_suffix(); + + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); @@ -559,27 +545,27 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); OCL_CHECK( - clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + clSetKernelArg(Kernel, 7, 
sizeof(cl_bool), (void*) &has_ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); size_t Global_Work_Size[1] = { (size_t) nthreads }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SoftmaxLossBackwardGPU(const int nthreads, - const float* top, const float* label, float* bottom_diff, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, float* counts); + const float* top, const float* label, float* bottom_diff, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, float* counts); template void SoftmaxLossBackwardGPU(const int nthreads, - const double* top, const double* label, double* bottom_diff, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, double* counts); + const double* top, const double* label, double* bottom_diff, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, double* counts); -template +template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); @@ -588,18 +574,18 @@ void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { size_t Global_Work_Size[1] = { (size_t) num }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void scal_gpu(cl_kernel Kernel, const int num, - const float alpha, float* data); + const float alpha, float* data); template void scal_gpu(cl_kernel Kernel, const int num, - const double alpha, double* data); + const double alpha, double* data); -template +template void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, - const Dtype* label) { + const Dtype* label) { OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); @@ -608,21 +594,21 @@ void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, size_t Global_Work_Size[1] = { (size_t) num }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void diff_gpu(cl_kernel Kernel, const int num, const int dim, - float* data, const float* label); + float* data, const float* label); template void diff_gpu(cl_kernel Kernel, const int num, const int dim, - double* data, const double* label); + double* data, const double* label); -template +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - Dtype* top_data) { + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data) { cl_int ret; ret = clSetKernelArg(Kernel, 0, 
sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); @@ -640,28 +626,28 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + float* top_data); template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + double* top_data); -template +template void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, - Dtype* top_mask) { + const int channels_, const int height_, const int width_, + const int 
pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask) { std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -688,30 +674,29 @@ void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxPoolForward(const int count, const float* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, float* top_data, int* mask, - float* top_mask); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data, int* mask, + float* top_mask); template void MaxPoolForward(const int count, const double* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, double* top_data, int* mask, - double* top_mask); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, 
const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data, int* mask, + double* top_mask); -template +template void StoPoolForwardTrain(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* idx_data, Dtype* top_data) - { + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data) { std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -734,26 +719,27 @@ void StoPoolForwardTrain(const int count, const Dtype* bottom_data, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void StoPoolForwardTrain(const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, float* idx_data, float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* idx_data, float* top_data); template void StoPoolForwardTrain(const int count, - const double* bottom_data, const int clnum, 
const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, double* idx_data, double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* idx_data, + double* top_data); -template +template void StoPoolForwardTest(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* top_data) { + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data) { std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -776,27 +762,27 @@ void StoPoolForwardTest(const int count, const Dtype* bottom_data, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void StoPoolForwardTest(const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, float* top_data); + const float* bottom_data, const int clnum, const int 
channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* top_data); template void StoPoolForwardTest(const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* top_data); -template +template void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data) { + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data) { std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -820,26 +806,26 @@ void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void 
AvePoolForward(const int count, const float* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, float* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data); template void AvePoolForward(const int count, const double* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, double* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data); -template +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, Dtype* top_data) { + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data) { cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); @@ -858,27 +844,27 @@ void ave_pool_fp_gpu(cl_kernel Kernel, const int count, 
size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* top_data); -template +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, Dtype* bottom_diff) { + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int 
kernel_size_, + const int stride_, Dtype* bottom_diff) { cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); @@ -898,28 +884,28 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const float* top_data, const float* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, float* bottom_diff); + const float* bottom_data, const float* top_data, const float* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, float* bottom_diff); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const double* top_data, const double* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, double* bottom_diff); + const double* bottom_data, const double* top_data, const double* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, double* bottom_diff); -template +template void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, 
const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -945,32 +931,31 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void MaxPoolBackward(const int nthreads, - const float* const top_diff, const int* const mask, - const float* const top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - float* const bottom_diff); + const float* const top_diff, const int* const mask, + const float* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); template void MaxPoolBackward(const int nthreads, - const double* const 
top_diff, const int* const mask, - const double* const top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - double* const bottom_diff); - -template + const double* const top_diff, const int* const mask, + const double* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) - { + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -995,28 +980,28 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void AvePoolBackward(const int nthreads, - const float* const top_diff, const int num, const int 
channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - float* const bottom_diff); + const float* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); template void AvePoolBackward(const int nthreads, - const double* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - double* const bottom_diff); + const double* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); -template +template void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, - const Dtype* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, Dtype* const bottom_diff) { + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff) { std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1039,27 +1024,27 @@ void StoPoolBackward(const int 
nthreads, const Dtype* const rand_idx, size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void StoPoolBackward(const int nthreads, - const float* const rand_idx, const float* const top_diff, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, - float* const bottom_diff); + const float* const rand_idx, const float* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + float* const bottom_diff); template void StoPoolBackward(const int nthreads, - const double* const rand_idx, const double* const top_diff, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, - double* const bottom_diff); + const double* const rand_idx, const double* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + double* const bottom_diff); -template +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, const int pad_, Dtype* bottom_diff) { + const int 
clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const int pad_, Dtype* bottom_diff) { cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); @@ -1078,25 +1063,25 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, - const float* top_diff, const int clnum, const int channels_, - const int intheight_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, float* bottom_diff); + const float* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, - const double* top_diff, const int clnum, const int channels_, - const int intheight_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, double* bottom_diff); + const double* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* bottom_diff); -template +template void PReLUForward(const int count, const int channels, const int dim, - const Dtype* 
bottom_data, Dtype* top_data, const Dtype* slope_data, - const int div_factor) { + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor) { std::string kernel_name = "PReLUForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1110,20 +1095,20 @@ void PReLUForward(const int count, const int channels, const int dim, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void PReLUForward(const int count, const int channels, - const int dim, const float* bottom_data, float* top_data, - const float* slope_data, const int div_factor); + const int dim, const float* bottom_data, float* top_data, + const float* slope_data, const int div_factor); template void PReLUForward(const int count, const int channels, - const int dim, const double* bottom_data, double* top_data, - const double* slope_data, const int div_factor); + const int dim, const double* bottom_data, double* top_data, + const double* slope_data, const int div_factor); -template +template void PReLUBackward(const int count, const int channels, const int dim, - const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, - const Dtype* slope_data, const int div_factor) { + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor) { std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1138,20 +1123,20 @@ void PReLUBackward(const int count, const int channels, const int dim, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void PReLUBackward(const int count, const int channels, - const int dim, const float* top_diff, const float* bottom_data, - float* bottom_diff, const float* slope_data, const int div_factor); + const int dim, const float* top_diff, const float* bottom_data, + float* bottom_diff, const float* slope_data, const int div_factor); template void PReLUBackward(const int count, const int channels, - const int dim, const double* top_diff, const double* bottom_data, - double* bottom_diff, const double* slope_data, const int div_factor); + const int dim, const double* top_diff, const double* bottom_data, + double* bottom_diff, const double* slope_data, const int div_factor); -template +template void PReLUParamBackward(const int count, const Dtype* top_diff, - const int offset_out, const Dtype* bottom_data, const int offset_in, - Dtype* bottom_diff) { + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff) { std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1164,19 +1149,19 @@ void PReLUParamBackward(const int count, const Dtype* top_diff, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void PReLUParamBackward(const int count, const float* top_diff, - const int offset_out, const float* bottom_data, const int offset_in, - float* bottom_diff); + const int offset_out, const float* bottom_data, const int offset_in, + 
float* bottom_diff); template void PReLUParamBackward(const int count, - const double* top_diff, const int offset_out, const double* bottom_data, - const int offset_in, double* bottom_diff); + const double* top_diff, const int offset_out, const double* bottom_data, + const int offset_in, double* bottom_diff); -template +template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, - Dtype negative_slope) { + Dtype negative_slope) { std::string kernel_name = "ReLUForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1188,18 +1173,18 @@ void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ReLUForward(const int count, const float* bottom_data, - float* top_data, float negative_slope); + float* top_data, float negative_slope); template void ReLUForward(const int count, const double* bottom_data, - double* top_data, double negative_slope); + double* top_data, double negative_slope); -template +template void ReLUBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -1214,17 +1199,17 @@ void ReLUBackward(const int count, const Dtype* top_diff, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ReLUBackward(const int count, const float* top_diff, - const float* bottom_data, float* bottom_diff, float negative_slope); + const float* bottom_data, float* bottom_diff, float negative_slope); template void ReLUBackward(const int count, const double* top_diff, - const double* bottom_data, double* bottom_diff, double negative_slope); + const double* bottom_data, double* bottom_diff, double negative_slope); -template +template void SigmoidForward(const int count, const Dtype* bottom_data, - Dtype* top_data) { + Dtype* top_data) { std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1235,18 +1220,18 @@ void SigmoidForward(const int count, const Dtype* bottom_data, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SigmoidForward(const int count, const float* bottom_data, - float* top_data); + float* top_data); template void SigmoidForward(const int count, const double* bottom_data, - double* top_data); + double* top_data); -template +template void SigmoidBackward(const int count, const Dtype* top_diff, - const Dtype* top_data, Dtype* bottom_diff) { + const Dtype* top_data, Dtype* bottom_diff) { std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -1260,17 +1245,17 @@ void SigmoidBackward(const int count, const Dtype* top_diff, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, 
- uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void SigmoidBackward(const int count, const float* top_diff, - const float* top_data, float* bottom_diff); + const float* top_data, float* bottom_diff); template void SigmoidBackward(const int count, const double* top_diff, - const double* top_data, double* bottom_diff); + const double* top_data, double* bottom_diff); -template +template void ThresholdForward(const int count, const Dtype threshold, - const Dtype* bottom_data, Dtype* top_data) { + const Dtype* bottom_data, Dtype* top_data) { std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int ret; @@ -1282,16 +1267,16 @@ void ThresholdForward(const int count, const Dtype threshold, size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ThresholdForward(const int count, const float threshold, - const float* bottom_data, float* top_data); + const float* bottom_data, float* top_data); template void ThresholdForward(const int count, const double threshold, - const double* bottom_data, double* top_data); + const double* bottom_data, double* top_data); -template +template void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { std::string kernel_name = "TanHForward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -1303,18 +1288,18 @@ void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void TanHForward(const int count, const float* bottom_data, - float* top_data); + float* top_data); template void TanHForward(const int count, const double* bottom_data, - double* top_data); + double* top_data); -template +template void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, - Dtype* bottom_diff) { + Dtype* bottom_diff) { std::string kernel_name = "TanHBackward" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); @@ -1328,381 +1313,441 @@ void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, size_t uiGlobal_Work_Size[] = { (size_t) count }; size_t uiLocal_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void TanHBackward(const int count, const float* top_diff, - const float* top_data, float* bottom_diff); + const float* top_data, float* bottom_diff); template void TanHBackward(const int count, const double* top_diff, - const double* top_data, double* bottom_diff); + const double* top_data, double* bottom_diff); -template +template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) { - std::string kernel_name = "opttrans" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); - int num_kernels = channels * height * width * optnum; + int num_kernels = channels * height * width * optnum; - cl_int ret; - ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels); - ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im); - ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset); - ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt); - ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset); - ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum); - OCL_CHECK(ret); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = {(size_t)num_kernels}; - size_t uiLocal_Work_Size[] = {256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) ); + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } -template void opttrans(const float* data_im, const int im_offset, const int channels, - const int height, 
const int width, float* data_opt, const int opt_offset, const int optnum); -template void opttrans(const double* data_im, const int im_offset, const int channels, - const int height, const int width, double* data_opt, const int opt_offset, const int optnum); +template void opttrans(const float* data_im, const int im_offset, + const int channels, + const int height, const int width, float* data_opt, const int opt_offset, + const int optnum); +template void opttrans(const double* data_im, const int im_offset, + const int channels, + const int height, const int width, double* data_opt, const int opt_offset, + const int optnum); template void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale){ - std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); - cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads); - ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in); - ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num); - ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size); - ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k); - ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={(size_t)nthreads}; - size_t uiLocal_Work_Size[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) ); + const int num, const int channels, const int height, + const int width, const int size, const Dtype 
alpha_over_size, + const Dtype k, Dtype* const scale) { + std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); + cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); + ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); + ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void LRNFillScale(const int nthreads, const float* const in, - const int num, const int channels, const int height, - const int width, const int size, const float alpha_over_size, - const float k, float* const scale); + const int num, const int channels, const int height, + const int width, const int size, const float alpha_over_size, + const float k, float* const scale); template void LRNFillScale(const int nthreads, const double* const in, - const int num, const int channels, const int height, - const int width, const int size, const double alpha_over_size, - const double k, double* const scale); + const int num, const int channels, const int height, + const int width, const int size, const double alpha_over_size, + const double k, double* const scale); template void LRNComputeOutput(int nthreads, const Dtype* in, 
- Dtype* scale, Dtype negative_beta, Dtype* out){ - std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); - cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads); - ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in); - ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale); - ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta); - ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[]={(size_t)nthreads}; - size_t uiLocal_Work_Size2[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) ); + Dtype* scale, Dtype negative_beta, Dtype* out) { + std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); + cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); } template void LRNComputeOutput(int nthreads, const float* in, - float* scale, float negative_beta, float* out); + float* scale, float negative_beta, float* out); template void LRNComputeOutput(int nthreads, const double* in, - double* scale, double negative_beta, double* out); + double* scale, double negative_beta, double* out); template void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* 
const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff){ - std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); - cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads); - ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data); - ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data); - ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale); - ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff); - ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num); - ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels); - ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height); - ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width); - ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size); - ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta); - ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio); - ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[]={(size_t)nthreads}; - size_t uiLocal_Work_Size[]={256}; - OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) ); + const Dtype* const bottom_data, const Dtype* const top_data, + const Dtype* const scale, const Dtype* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, Dtype* const bottom_diff) { + std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); + cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) 
&nthreads); + ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); + ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void LRNComputeDiff(const int nthreads, - const float* const bottom_data, const float* const top_data, - const float* const scale, const float* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const float negative_beta, - const float cache_ratio, float* const bottom_diff); + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const float negative_beta, + const float cache_ratio, float* const bottom_diff); template void LRNComputeDiff(const int nthreads, - const double* const bottom_data, const double* const top_data, - const double* const scale, const double* const top_diff, - const int num, const int channels, const 
int height, - const int width, const int size, const double negative_beta, - const double cache_ratio, double* const bottom_diff); - -template -void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){ - std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add (const int n, const float* in1, const float* in2, float* y); -template void caffe_gpu_add (const int n, const double* in1, const double* in2, double* y); - -template -void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y ){ - std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)N}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y ); -template void caffe_gpu_sign_ocl(const int N, const double* X, double* Y ); - -template -void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ){ - std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); - cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)N}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y ); -template void caffe_gpu_abs_ocl(const int N, const double* X, double* Y ); - -template -void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = "div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_div (const int n, const float* a, const float* b, float* y); -template void caffe_gpu_div (const int n, const double* a, const double* b, double* y); - -template -void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){ - std::string kernel_name = "add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = 
{256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_add_scalar (const int n, const float alpha, float* top_data); -template void caffe_gpu_add_scalar (const int n, const double alpha, double* top_data); - -template -void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){ - std::string kernel_name = "element_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_mul (const int n, const float* a, const float* b, float* y); -template void caffe_gpu_mul (const int n, const double* a, const double* b, double* y); - -template -void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){ - std::string kernel_name = "powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = {(size_t)n}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void caffe_gpu_powx (const int n, const float* a, const float alpha, float* y); -template 
void caffe_gpu_powx (const int n, const double* a, const double alpha, double* y); - -template -void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data) -{ - std::string kernel_name = "DropoutForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count); - ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data); - ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem); - ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_); - ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); -} - -template void DropoutForward(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data); -template void DropoutForward(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data); + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, + const int num, const int channels, const int height, + const int width, const int size, const double negative_beta, + const double cache_ratio, double* const bottom_diff); template -void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff) -{ - std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&MaskMem); - ret |= clSetKernelArg(kernel,3,sizeof(cl_int), (void*)&threshold_); - ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_); - ret |= clSetKernelArg(kernel,5,sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); +template void caffe_gpu_add(const int n, const float* in1, + const float* in2, float* y); +template void caffe_gpu_add(const int n, const double* in1, + const double* in2, double* y); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void DropoutBackward(const 
int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff); -template void DropoutBackward(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff); +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_sign_ocl(const int N, const double* X, + double* Y); template -void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) -{ - std::string kernel_name = "BNLLForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&top_data); - OCL_CHECK(ret); +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_abs_ocl(const int N, const double* X, + double* Y); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +template +void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "div" + 
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void BNLLForward(const int count, const float* bottom_data, float *top_data); -template void BNLLForward(const int count, const double* bottom_data, double *top_data); -template -void BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff) -{ - std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0,sizeof(cl_int), (void*)&count); - ret |= clSetKernelArg(kernel,1,sizeof(cl_mem), (void*)&top_diff); - ret |= clSetKernelArg(kernel,2,sizeof(cl_mem), (void*)&bottom_data); - ret |= clSetKernelArg(kernel,3,sizeof(cl_mem), (void*)&bottom_diff); - OCL_CHECK(ret); +template void caffe_gpu_div(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_div(const int n, const double* a, + const double* b, double* y); - size_t Global_Work_Size[] = {(size_t)count}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +template +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= 
clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar(const int n, const float alpha, + float* top_data); +template void caffe_gpu_add_scalar(const int n, const double alpha, + double* top_data); + +template +void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_mul(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t 
Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx(const int n, const float* a, + const float alpha, float* y); +template void caffe_gpu_powx(const int n, const double* a, + const double alpha, double* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, + const int* MaskMem, const Dtype scale_, Dtype* top_data) { + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void DropoutForward(const int count, const float* bottom_data, + const int* MaskMem, const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, + const int* MaskMem, const double scale_, double* top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, + const float threshold_, const Dtype scale_, Dtype* bottom_diff) { + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) 
&MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void BNLLBackward(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff); -template void BNLLBackward(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff); - - -template -void Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data) -{ - std::string kernel_name = "Concat" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - int k_forward = (forward == true)? 
1 : 0; - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&in_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&k_forward); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&num_concats); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&concat_size); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&top_concat_axis); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&bottom_concat_axis); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&offset_concat_axis); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&out_data); - OCL_CHECK(ret); +template void DropoutBackward(const int count, const float* top_diff, + const int* MaskMem, const float threshold_, const float scale_, + float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, + const int* MaskMem, const float threshold_, const double scale_, + double* bottom_diff); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) { + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); - size_t Global_Work_Size[] = {(size_t)nthreads}; - size_t Local_Work_Size[] = {256}; - OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void Concat(const int nthreads, const float* in_data, const bool 
forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data); -template void Concat(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data); +template void BNLLForward(const int count, const float* bottom_data, + float *top_data); +template void BNLLForward(const int count, const double* bottom_data, + double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff) { + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLBackward(const int count, const float* top_diff, + const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, + const double* bottom_data, double *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, Dtype *out_data) { + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == 
true) ? 1 : 0; + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Concat(const int nthreads, const float* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, float *out_data); +template void Concat(const int nthreads, const double* in_data, + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, double *out_data); template void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) - { + const Dtype margin, const bool legacy_version, const Dtype alpha, + const Dtype* y, const Dtype* diff, const Dtype* dist_sq, + Dtype *bottom_diff) { std::string kernel_name = "CLLBackward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); @@ -1721,23 +1766,22 @@ void CLLBackward(const int count, const int channels, size_t 
Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - const float* y, const float* diff, const float* dist_sq, - float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + const float* y, const float* diff, const float* dist_sq, + float *bottom_diff); template void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - const double* y, const double* diff, const double* dist_sq, - double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + const double* y, const double* diff, const double* dist_sq, + double *bottom_diff); -template +template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) - { + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* mask) { std::string kernel_name = "MaxForward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); @@ -1753,21 +1797,20 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a, size_t Global_Work_Size[] = { (size_t) nthreads }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxForward(const int nthreads, const float* bottom_data_a, - const float* bottom_data_b, const int blob_idx, float* top_data, 
- int* mask); + const float* bottom_data_b, const int blob_idx, float* top_data, + int* mask); template void MaxForward(const int nthreads, - const double* bottom_data_a, - const double* bottom_data_b, const int blob_idx, double* top_data, - int* mask); + const double* bottom_data_a, + const double* bottom_data_b, const int blob_idx, double* top_data, + int* mask); -template +template void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) - { + const int blob_idx, const int* mask, Dtype* bottom_diff) { std::string kernel_name = "MaxBackward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); @@ -1782,27 +1825,27 @@ void MaxBackward(const int nthreads, const Dtype* top_diff, size_t Global_Work_Size[] = { (size_t) nthreads }; size_t Local_Work_Size[] = { 256 }; OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxBackward(const int nthreads, const float* top_diff, - const int blob_idx, const int* mask, float* bottom_diff); + const int blob_idx, const int* mask, float* bottom_diff); template void MaxBackward(const int nthreads, const double* top_diff, - const int blob_idx, const int* mask, double* bottom_diff); + const int blob_idx, const int* mask, double* bottom_diff); -template +template void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, - int channel_in, int width, int height, int channel_out, int width_out, - int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz) - { + int channel_in, int width, int height, int channel_out, int width_out, + int height_out, int kernel_w, int kernel_h, int stride, int pad, + int batch_sz) { } template void ocl_conv(float* bottom_data, float* top_data, - float* weights, float* 
bias, int channel_in, int width, int height, - int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, - int stride, int pad, int batch_sz); + float* weights, float* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); template void ocl_conv(double* bottom_data, double* top_data, - double* weights, double* bias, int channel_in, int width, int height, - int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, - int stride, int pad, int batch_sz); + double* weights, double* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); } // namespace caffe diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index f4373901..da533cd9 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -30,7 +30,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) { } bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers, - NetParameter* net_param) { + NetParameter* net_param) { // First upgrade padding layers to padded conv layers. 
NetParameter v0_net_param; UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param); @@ -42,7 +42,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers, } for (int i = 0; i < v0_net_param.layers_size(); ++i) { is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i), - net_param->add_layers()); + net_param->add_layers()); } for (int i = 0; i < v0_net_param.input_size(); ++i) { net_param->add_input(v0_net_param.input(i)); @@ -57,7 +57,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers, } void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad) { + NetParameter* param_upgraded_pad) { // Copy everything other than the layers from the original param. param_upgraded_pad->Clear(); param_upgraded_pad->CopyFrom(param); @@ -78,7 +78,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param, for (int j = 0; j < layer_connection.bottom_size(); ++j) { const string& blob_name = layer_connection.bottom(j); if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { + blob_name_to_last_top_idx.end()) { LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; } const int top_idx = blob_name_to_last_top_idx[blob_name]; @@ -92,20 +92,20 @@ void UpgradeV0PaddingLayers(const NetParameter& param, // the padding layer input has only one input and one output. Other // cases have undefined behavior in Caffe. 
CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool")) - << "Padding layer input to " - "non-convolutional / non-pooling layer type " - << layer_param.type(); + << "Padding layer input to " + "non-convolutional / non-pooling layer type " + << layer_param.type(); CHECK_EQ(layer_connection.bottom_size(), 1) - << "Conv Layer takes a single blob as input."; + << "Conv Layer takes a single blob as input."; CHECK_EQ(source_layer.bottom_size(), 1) - << "Padding Layer takes a single blob as input."; + << "Padding Layer takes a single blob as input."; CHECK_EQ(source_layer.top_size(), 1) - << "Padding Layer produces a single blob as output."; + << "Padding Layer produces a single blob as output."; int layer_index = param_upgraded_pad->layers_size() - 1; param_upgraded_pad->mutable_layers(layer_index)->mutable_layer() - ->set_pad(source_layer.layer().pad()); + ->set_pad(source_layer.layer().pad()); param_upgraded_pad->mutable_layers(layer_index) - ->set_bottom(j, source_layer.bottom(0)); + ->set_bottom(j, source_layer.bottom(0)); } } for (int j = 0; j < layer_connection.top_size(); ++j) { @@ -116,7 +116,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param, } bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param) { + V1LayerParameter* layer_param) { bool is_fully_compatible = true; layer_param->Clear(); for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) { @@ -146,10 +146,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_num_output()) { if (type == "conv") { layer_param->mutable_convolution_param()->set_num_output( - v0_layer_param.num_output()); + v0_layer_param.num_output()); } else if (type == "innerproduct") { layer_param->mutable_inner_product_param()->set_num_output( - v0_layer_param.num_output()); + v0_layer_param.num_output()); } else { LOG(ERROR) << "Unknown parameter num_output for layer type " << type; is_fully_compatible = false; 
@@ -158,10 +158,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_biasterm()) { if (type == "conv") { layer_param->mutable_convolution_param()->set_bias_term( - v0_layer_param.biasterm()); + v0_layer_param.biasterm()); } else if (type == "innerproduct") { layer_param->mutable_inner_product_param()->set_bias_term( - v0_layer_param.biasterm()); + v0_layer_param.biasterm()); } else { LOG(ERROR) << "Unknown parameter biasterm for layer type " << type; is_fully_compatible = false; @@ -170,10 +170,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_weight_filler()) { if (type == "conv") { layer_param->mutable_convolution_param()-> - mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); + mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); } else if (type == "innerproduct") { layer_param->mutable_inner_product_param()-> - mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); + mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); } else { LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type; is_fully_compatible = false; @@ -182,10 +182,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_bias_filler()) { if (type == "conv") { layer_param->mutable_convolution_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); + mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); } else if (type == "innerproduct") { layer_param->mutable_inner_product_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); + mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); } else { LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type; is_fully_compatible = false; @@ -204,10 +204,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_kernelsize()) { if 
(type == "conv") { layer_param->mutable_convolution_param()->set_kernel_size( - v0_layer_param.kernelsize()); + v0_layer_param.kernelsize()); } else if (type == "pool") { layer_param->mutable_pooling_param()->set_kernel_size( - v0_layer_param.kernelsize()); + v0_layer_param.kernelsize()); } else { LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type; is_fully_compatible = false; @@ -216,7 +216,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_group()) { if (type == "conv") { layer_param->mutable_convolution_param()->set_group( - v0_layer_param.group()); + v0_layer_param.group()); } else { LOG(ERROR) << "Unknown parameter group for layer type " << type; is_fully_compatible = false; @@ -225,10 +225,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_stride()) { if (type == "conv") { layer_param->mutable_convolution_param()->set_stride( - v0_layer_param.stride()); + v0_layer_param.stride()); } else if (type == "pool") { layer_param->mutable_pooling_param()->set_stride( - v0_layer_param.stride()); + v0_layer_param.stride()); } else { LOG(ERROR) << "Unknown parameter stride for layer type " << type; is_fully_compatible = false; @@ -240,15 +240,15 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, switch (pool) { case V0LayerParameter_PoolMethod_MAX: layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); + PoolingParameter_PoolMethod_MAX); break; case V0LayerParameter_PoolMethod_AVE: layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); + PoolingParameter_PoolMethod_AVE); break; case V0LayerParameter_PoolMethod_STOCHASTIC: layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); + PoolingParameter_PoolMethod_STOCHASTIC); break; default: LOG(ERROR) << "Unknown pool method " << pool; @@ -262,7 +262,7 @@ bool UpgradeV0LayerParameter(const 
V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_dropout_ratio()) { if (type == "dropout") { layer_param->mutable_dropout_param()->set_dropout_ratio( - v0_layer_param.dropout_ratio()); + v0_layer_param.dropout_ratio()); } else { LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type; is_fully_compatible = false; @@ -271,7 +271,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_local_size()) { if (type == "lrn") { layer_param->mutable_lrn_param()->set_local_size( - v0_layer_param.local_size()); + v0_layer_param.local_size()); } else { LOG(ERROR) << "Unknown parameter local_size for layer type " << type; is_fully_compatible = false; @@ -306,16 +306,16 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, layer_param->mutable_data_param()->set_source(v0_layer_param.source()); } else if (type == "hdf5_data") { layer_param->mutable_hdf5_data_param()->set_source( - v0_layer_param.source()); + v0_layer_param.source()); } else if (type == "images") { layer_param->mutable_image_data_param()->set_source( - v0_layer_param.source()); + v0_layer_param.source()); } else if (type == "window_data") { layer_param->mutable_window_data_param()->set_source( - v0_layer_param.source()); + v0_layer_param.source()); } else if (type == "infogain_loss") { layer_param->mutable_infogain_loss_param()->set_source( - v0_layer_param.source()); + v0_layer_param.source()); } else { LOG(ERROR) << "Unknown parameter source for layer type " << type; is_fully_compatible = false; @@ -323,25 +323,25 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, } if (v0_layer_param.has_scale()) { layer_param->mutable_transform_param()-> - set_scale(v0_layer_param.scale()); + set_scale(v0_layer_param.scale()); } if (v0_layer_param.has_meanfile()) { layer_param->mutable_transform_param()-> - set_mean_file(v0_layer_param.meanfile()); + set_mean_file(v0_layer_param.meanfile()); } if 
(v0_layer_param.has_batchsize()) { if (type == "data") { layer_param->mutable_data_param()->set_batch_size( - v0_layer_param.batchsize()); + v0_layer_param.batchsize()); } else if (type == "hdf5_data") { layer_param->mutable_hdf5_data_param()->set_batch_size( - v0_layer_param.batchsize()); + v0_layer_param.batchsize()); } else if (type == "images") { layer_param->mutable_image_data_param()->set_batch_size( - v0_layer_param.batchsize()); + v0_layer_param.batchsize()); } else if (type == "window_data") { layer_param->mutable_window_data_param()->set_batch_size( - v0_layer_param.batchsize()); + v0_layer_param.batchsize()); } else { LOG(ERROR) << "Unknown parameter batchsize for layer type " << type; is_fully_compatible = false; @@ -349,19 +349,19 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, } if (v0_layer_param.has_cropsize()) { layer_param->mutable_transform_param()-> - set_crop_size(v0_layer_param.cropsize()); + set_crop_size(v0_layer_param.cropsize()); } if (v0_layer_param.has_mirror()) { layer_param->mutable_transform_param()-> - set_mirror(v0_layer_param.mirror()); + set_mirror(v0_layer_param.mirror()); } if (v0_layer_param.has_rand_skip()) { if (type == "data") { layer_param->mutable_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); + v0_layer_param.rand_skip()); } else if (type == "images") { layer_param->mutable_image_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); + v0_layer_param.rand_skip()); } else { LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type; is_fully_compatible = false; @@ -370,7 +370,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_shuffle_images()) { if (type == "images") { layer_param->mutable_image_data_param()->set_shuffle( - v0_layer_param.shuffle_images()); + v0_layer_param.shuffle_images()); } else { LOG(ERROR) << "Unknown parameter shuffle for layer type " << type; is_fully_compatible = false; @@ -379,7 +379,7 @@ 
bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_new_height()) { if (type == "images") { layer_param->mutable_image_data_param()->set_new_height( - v0_layer_param.new_height()); + v0_layer_param.new_height()); } else { LOG(ERROR) << "Unknown parameter new_height for layer type " << type; is_fully_compatible = false; @@ -388,7 +388,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_new_width()) { if (type == "images") { layer_param->mutable_image_data_param()->set_new_width( - v0_layer_param.new_width()); + v0_layer_param.new_width()); } else { LOG(ERROR) << "Unknown parameter new_width for layer type " << type; is_fully_compatible = false; @@ -397,7 +397,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_concat_dim()) { if (type == "concat") { layer_param->mutable_concat_param()->set_concat_dim( - v0_layer_param.concat_dim()); + v0_layer_param.concat_dim()); } else { LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type; is_fully_compatible = false; @@ -406,60 +406,60 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, if (v0_layer_param.has_det_fg_threshold()) { if (type == "window_data") { layer_param->mutable_window_data_param()->set_fg_threshold( - v0_layer_param.det_fg_threshold()); + v0_layer_param.det_fg_threshold()); } else { LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type " - << type; + << type; is_fully_compatible = false; } } if (v0_layer_param.has_det_bg_threshold()) { if (type == "window_data") { layer_param->mutable_window_data_param()->set_bg_threshold( - v0_layer_param.det_bg_threshold()); + v0_layer_param.det_bg_threshold()); } else { LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type " - << type; + << type; is_fully_compatible = false; } } if (v0_layer_param.has_det_fg_fraction()) { if (type == "window_data") { 
layer_param->mutable_window_data_param()->set_fg_fraction( - v0_layer_param.det_fg_fraction()); + v0_layer_param.det_fg_fraction()); } else { LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type " - << type; + << type; is_fully_compatible = false; } } if (v0_layer_param.has_det_context_pad()) { if (type == "window_data") { layer_param->mutable_window_data_param()->set_context_pad( - v0_layer_param.det_context_pad()); + v0_layer_param.det_context_pad()); } else { LOG(ERROR) << "Unknown parameter det_context_pad for layer type " - << type; + << type; is_fully_compatible = false; } } if (v0_layer_param.has_det_crop_mode()) { if (type == "window_data") { layer_param->mutable_window_data_param()->set_crop_mode( - v0_layer_param.det_crop_mode()); + v0_layer_param.det_crop_mode()); } else { LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " - << type; + << type; is_fully_compatible = false; } } if (v0_layer_param.has_hdf5_output_param()) { if (type == "hdf5_output") { layer_param->mutable_hdf5_output_param()->CopyFrom( - v0_layer_param.hdf5_output_param()); + v0_layer_param.hdf5_output_param()); } else { LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type " - << type; + << type; is_fully_compatible = false; } } @@ -613,42 +613,42 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { // NetParameter was specified using the old style (V0LayerParameter); try to // upgrade it. 
LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V0LayerParameter: " << param_file; + << "V0LayerParameter: " << param_file; NetParameter original_param(*param); if (!UpgradeV0Net(original_param, param)) { success = false; LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V0NetParameter to NetParameter (see above); continuing anyway."; + << "V0NetParameter to NetParameter (see above); continuing anyway."; } else { LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V0LayerParameter"; + << "V0LayerParameter"; } LOG(ERROR) << "Note that future Caffe releases will not support " - << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " - << "prototxt and ./build/tools/upgrade_net_proto_binary for model " - << "weights upgrade this and any other net protos to the new format."; + << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " + << "prototxt and ./build/tools/upgrade_net_proto_binary for model " + << "weights upgrade this and any other net protos to the new format."; } // NetParameter uses old style data transformation fields; try to upgrade it. 
if (NetNeedsDataUpgrade(*param)) { LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "transformation parameters: " << param_file; + << "transformation parameters: " << param_file; UpgradeNetDataTransformation(param); LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "data transformation parameters."; + << "data transformation parameters."; LOG(ERROR) << "Note that future Caffe releases will only support " - << "transform_param messages for transformation fields."; + << "transform_param messages for transformation fields."; } if (NetNeedsV1ToV2Upgrade(*param)) { LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V1LayerParameter: " << param_file; + << "V1LayerParameter: " << param_file; NetParameter original_param(*param); if (!UpgradeV1Net(original_param, param)) { success = false; LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V1LayerParameter (see above); continuing anyway."; + << "V1LayerParameter (see above); continuing anyway."; } else { LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V1LayerParameter"; + << "V1LayerParameter"; } } return success; @@ -658,7 +658,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) { bool is_fully_compatible = true; if (v1_net_param.layer_size() > 0) { LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' " - << "fields; these will be ignored for the upgrade."; + << "fields; these will be ignored for the upgrade."; is_fully_compatible = false; } net_param->CopyFrom(v1_net_param); @@ -666,7 +666,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) { net_param->clear_layer(); for (int i = 0; i < v1_net_param.layers_size(); ++i) { if (!UpgradeV1LayerParameter(v1_net_param.layers(i), - net_param->add_layer())) { + net_param->add_layer())) { LOG(ERROR) << "Upgrade of input layer " << i << " failed."; is_fully_compatible 
= false; } @@ -675,7 +675,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) { } bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param) { + LayerParameter* layer_param) { layer_param->Clear(); bool is_fully_compatible = true; for (int i = 0; i < v1_layer_param.bottom_size(); ++i) { @@ -719,7 +719,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, break; default: LOG(FATAL) << "Unknown blob_share_mode: " - << v1_layer_param.blob_share_mode(i); + << v1_layer_param.blob_share_mode(i); break; } layer_param->mutable_param(i)->set_share_mode(mode); @@ -735,130 +735,130 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, layer_param->add_param(); } layer_param->mutable_param(i)->set_decay_mult( - v1_layer_param.weight_decay(i)); + v1_layer_param.weight_decay(i)); } for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) { layer_param->add_loss_weight(v1_layer_param.loss_weight(i)); } if (v1_layer_param.has_accuracy_param()) { layer_param->mutable_accuracy_param()->CopyFrom( - v1_layer_param.accuracy_param()); + v1_layer_param.accuracy_param()); } if (v1_layer_param.has_argmax_param()) { layer_param->mutable_argmax_param()->CopyFrom( - v1_layer_param.argmax_param()); + v1_layer_param.argmax_param()); } if (v1_layer_param.has_concat_param()) { layer_param->mutable_concat_param()->CopyFrom( - v1_layer_param.concat_param()); + v1_layer_param.concat_param()); } if (v1_layer_param.has_contrastive_loss_param()) { layer_param->mutable_contrastive_loss_param()->CopyFrom( - v1_layer_param.contrastive_loss_param()); + v1_layer_param.contrastive_loss_param()); } if (v1_layer_param.has_convolution_param()) { layer_param->mutable_convolution_param()->CopyFrom( - v1_layer_param.convolution_param()); + v1_layer_param.convolution_param()); } if (v1_layer_param.has_data_param()) { layer_param->mutable_data_param()->CopyFrom( - v1_layer_param.data_param()); + 
v1_layer_param.data_param()); } if (v1_layer_param.has_dropout_param()) { layer_param->mutable_dropout_param()->CopyFrom( - v1_layer_param.dropout_param()); + v1_layer_param.dropout_param()); } if (v1_layer_param.has_dummy_data_param()) { layer_param->mutable_dummy_data_param()->CopyFrom( - v1_layer_param.dummy_data_param()); + v1_layer_param.dummy_data_param()); } if (v1_layer_param.has_eltwise_param()) { layer_param->mutable_eltwise_param()->CopyFrom( - v1_layer_param.eltwise_param()); + v1_layer_param.eltwise_param()); } if (v1_layer_param.has_exp_param()) { layer_param->mutable_exp_param()->CopyFrom( - v1_layer_param.exp_param()); + v1_layer_param.exp_param()); } if (v1_layer_param.has_hdf5_data_param()) { layer_param->mutable_hdf5_data_param()->CopyFrom( - v1_layer_param.hdf5_data_param()); + v1_layer_param.hdf5_data_param()); } if (v1_layer_param.has_hdf5_output_param()) { layer_param->mutable_hdf5_output_param()->CopyFrom( - v1_layer_param.hdf5_output_param()); + v1_layer_param.hdf5_output_param()); } if (v1_layer_param.has_hinge_loss_param()) { layer_param->mutable_hinge_loss_param()->CopyFrom( - v1_layer_param.hinge_loss_param()); + v1_layer_param.hinge_loss_param()); } if (v1_layer_param.has_image_data_param()) { layer_param->mutable_image_data_param()->CopyFrom( - v1_layer_param.image_data_param()); + v1_layer_param.image_data_param()); } if (v1_layer_param.has_infogain_loss_param()) { layer_param->mutable_infogain_loss_param()->CopyFrom( - v1_layer_param.infogain_loss_param()); + v1_layer_param.infogain_loss_param()); } if (v1_layer_param.has_inner_product_param()) { layer_param->mutable_inner_product_param()->CopyFrom( - v1_layer_param.inner_product_param()); + v1_layer_param.inner_product_param()); } if (v1_layer_param.has_lrn_param()) { layer_param->mutable_lrn_param()->CopyFrom( - v1_layer_param.lrn_param()); + v1_layer_param.lrn_param()); } if (v1_layer_param.has_memory_data_param()) { layer_param->mutable_memory_data_param()->CopyFrom( - 
v1_layer_param.memory_data_param()); + v1_layer_param.memory_data_param()); } if (v1_layer_param.has_mvn_param()) { layer_param->mutable_mvn_param()->CopyFrom( - v1_layer_param.mvn_param()); + v1_layer_param.mvn_param()); } if (v1_layer_param.has_pooling_param()) { layer_param->mutable_pooling_param()->CopyFrom( - v1_layer_param.pooling_param()); + v1_layer_param.pooling_param()); } if (v1_layer_param.has_power_param()) { layer_param->mutable_power_param()->CopyFrom( - v1_layer_param.power_param()); + v1_layer_param.power_param()); } if (v1_layer_param.has_relu_param()) { layer_param->mutable_relu_param()->CopyFrom( - v1_layer_param.relu_param()); + v1_layer_param.relu_param()); } if (v1_layer_param.has_sigmoid_param()) { layer_param->mutable_sigmoid_param()->CopyFrom( - v1_layer_param.sigmoid_param()); + v1_layer_param.sigmoid_param()); } if (v1_layer_param.has_softmax_param()) { layer_param->mutable_softmax_param()->CopyFrom( - v1_layer_param.softmax_param()); + v1_layer_param.softmax_param()); } if (v1_layer_param.has_slice_param()) { layer_param->mutable_slice_param()->CopyFrom( - v1_layer_param.slice_param()); + v1_layer_param.slice_param()); } if (v1_layer_param.has_tanh_param()) { layer_param->mutable_tanh_param()->CopyFrom( - v1_layer_param.tanh_param()); + v1_layer_param.tanh_param()); } if (v1_layer_param.has_threshold_param()) { layer_param->mutable_threshold_param()->CopyFrom( - v1_layer_param.threshold_param()); + v1_layer_param.threshold_param()); } if (v1_layer_param.has_window_data_param()) { layer_param->mutable_window_data_param()->CopyFrom( - v1_layer_param.window_data_param()); + v1_layer_param.window_data_param()); } if (v1_layer_param.has_transform_param()) { layer_param->mutable_transform_param()->CopyFrom( - v1_layer_param.transform_param()); + v1_layer_param.transform_param()); } if (v1_layer_param.has_loss_param()) { layer_param->mutable_loss_param()->CopyFrom( - v1_layer_param.loss_param()); + v1_layer_param.loss_param()); } if 
(v1_layer_param.has_layer()) { LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring."; @@ -956,16 +956,16 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) { } void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param) { + NetParameter* param) { CHECK(ReadProtoFromTextFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; + << "Failed to parse NetParameter file: " << param_file; UpgradeNetAsNeeded(param_file, param); } void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param) { + NetParameter* param) { CHECK(ReadProtoFromBinaryFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; + << "Failed to parse NetParameter file: " << param_file; UpgradeNetAsNeeded(param_file, param); } From 1e1bcd2a1be5502f2bd7c9cc3638cfd6e5b761dc Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 21:10:50 -0700 Subject: [PATCH 083/124] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8fadd98f..6f6cbd80 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL caffe is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. 
OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. #Design features -All layers ported to OpenCL - -Aligned with CAFFE’s latest code + -Aligned with caffe’s latest code -Performance improvement by batched sgemm implementation for conv layer From a8cb6de9bbfdd72ee182d990ac59c391c639b76f Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 22:50:42 -0700 Subject: [PATCH 084/124] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef4ae50d..2dbe44f9 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Training speed (Model: AlexNet) -AMD W9100 (5.2TFLOPS), 255 images per second --AMD R9 Fury((7.2TFLOPS)), 231 images per second +-AMD R9 Fury((7.2TFLOPS)), 261 images per second Recognition speed (Model: AlexNet) From d5cdc7a5edbd9ac809ae63eb2d9c59d62b461300 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 22:51:47 -0700 Subject: [PATCH 085/124] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6f6cbd80..fe91ccab 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only We will keep updating the latest performance we could achieve in this section. 
-* Training speed (Model: AlexNet) +* Training speed (Model: AlexNet, minibatch size 128) -AMD W9100 (5.2TFLOPS), 255 images per second - -AMD R9 Fury((7.2TFLOPS)), 231 images per second + -AMD R9 Fury((7.2TFLOPS)), 261 images per second -* Recognition speed (Model: AlexNet) +* Recognition speed (Model: AlexNet, minibatch size 128) -AMD W9100 (5.2TFLOPS), 590 images per second From 915fe5cd5a2f976b88e5149e78e658623b96f278 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 22:52:39 -0700 Subject: [PATCH 086/124] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2dbe44f9..77dae02f 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only We will keep updating the latest performance we could achieve in this section. -Training speed (Model: AlexNet) +Training speed (Model: AlexNet, minibatch size 128) -AMD W9100 (5.2TFLOPS), 255 images per second -AMD R9 Fury((7.2TFLOPS)), 261 images per second -Recognition speed (Model: AlexNet) +Recognition speed (Model: AlexNet, minibatch size 128) -AMD W9100 (5.2TFLOPS), 590 images per second From 2ea828984c628c68900afbe302533a2c12c1166f Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 22:59:48 -0700 Subject: [PATCH 087/124] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 77dae02f..6280e182 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. 
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. @@ -39,7 +39,10 @@ Recognition speed (Model: AlexNet, minibatch size 128) For more information on how to install, use or contribute to this code base, please visit our wiki page: https://github.com/amd/OpenCL-caffe/wiki -#License and support +#Support needed +We encourage the contribution and support from the community to improve it together. + +#License Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. 
# Caffe From ef00e37c66e8c9b7685c02f1a7da8628a31dde19 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 23:00:07 -0700 Subject: [PATCH 088/124] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6280e182..073e5515 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. 
From ce44b9e0194259695fc4eee3d185b15bb0cf7fd1 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 23:03:55 -0700 Subject: [PATCH 089/124] Update README.md --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fe91ccab..c11d9e66 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL caffe is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance. +This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. 
@@ -11,13 +11,11 @@ OpenCL is an open standard parallel programming language that is supported by mo -Performance improvement by batched sgemm implementation for conv layer - -User can choose optimal batch number depening on H/W, image size and minibatch size + -User can choose optimal batch number depending on H/W, image size and minibatch size - -Passes unit test - - -OpenCL 2.0, 1.2 + -Supports OpenCL 2.0, 1.2 - -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users + -only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 @@ -43,7 +41,10 @@ We will keep updating the latest performance we could achieve in this section. For more information on how to install, use or contribute to this code base, please visit our wiki page: https://github.com/amd/OpenCL-caffe/wiki -#License and support +#Support needed +We encourage the contribution and support from the community to improve it together. + +#License Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. 
# Original Caffe information From 44f67c18725bf223f02a5d37e4794e7886535da9 Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 10 Sep 2015 14:18:35 +0800 Subject: [PATCH 090/124] Fixed the bug in kernel_channel_sum(), passed throug softmaxwithloss, validated the loss output in log file --- src/caffe/layers/softmax_loss_layer.cpp | 1 - src/caffe/solver.cpp | 4 ---- src/caffe/util/ocl_wrapper.cpp | 12 ++++++------ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 58872a72..86a0d37a 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -160,7 +160,6 @@ void SoftmaxWithLossLayer::Forward_gpu( } else { loss /= outer_num_; } - printf("loss = %f\n", loss); top[0]->mutable_cpu_data()[0] = loss; if (top.size() == 2) { top[1]->ShareData(prob_); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 0a07a218..ffb77b78 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -229,11 +229,7 @@ void Solver::Step(int iters) { int idx = (iter_ - start_iter) % average_loss; smoothed_loss += (loss - losses[idx]) / average_loss; losses[idx] = loss; - printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, - losses[idx], idx); } - printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", - smoothed_loss, average_loss, losses.size()); if (display) { LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = net_->output_blobs(); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 8eb1a981..d54fd01e 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -424,7 +424,7 @@ void kernel_channel_sum(const int num, const int channels, OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); - size_t Global_Work_Size[1] = { 
(size_t)(num * channels) }; + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; size_t Local_Work_Size[1] = { 256 }; OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, @@ -499,7 +499,8 @@ void SoftmaxLossForwardGPU(const int nthreads, Dtype* counts) { std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - + + int int_has_ignore_label = has_ignore_label_ ? 1 : 0; OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); @@ -507,8 +508,7 @@ void SoftmaxLossForwardGPU(const int nthreads, OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK( - clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); @@ -536,6 +536,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, std::string kernel_name = "SoftmaxLossBackwardGPU" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int int_has_ignore_label = has_ignore_label_ ? 
1 : 0; OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); @@ -544,8 +545,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK( - clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); From f8fb6d3b159b3802fb34b947dbf9ad52b1ae44f8 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 9 Sep 2015 23:42:23 -0700 Subject: [PATCH 091/124] Update README.md --- README.md | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c11d9e66..2c9a0ef1 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,9 @@ #OpenCL caffe -This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. - -OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly. +This is an OpenCL implementation of the popular caffe DNN framework (https://github.com/BVLC/caffe). 
It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. #Design features -All layers ported to OpenCL - - -Aligned with caffe’s latest code -Performance improvement by batched sgemm implementation for conv layer @@ -15,27 +11,27 @@ OpenCL is an open standard parallel programming language that is supported by mo -Supports OpenCL 2.0, 1.2 - -only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users + -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 -Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need. +Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future. #Performance -We will keep updating the latest performance we could achieve in this section. +We will keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved. 
* Training speed (Model: AlexNet, minibatch size 128) - -AMD W9100 (5.2TFLOPS), 255 images per second + -AMD W9100, 255 images per second - -AMD R9 Fury((7.2TFLOPS)), 261 images per second + -AMD R9 Fury, 261 images per second * Recognition speed (Model: AlexNet, minibatch size 128) - -AMD W9100 (5.2TFLOPS), 590 images per second + -AMD W9100, 590 images per second - -AMD R9 Fury((7.2TFLOPS)), 699 images per second + -AMD R9 Fury, 699 images per second #Wiki For more information on how to install, use or contribute to this code base, please visit our wiki page: From fe779dfec920dc703387edaafdebd28e8ff339ac Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 10 Sep 2015 15:32:06 +0800 Subject: [PATCH 092/124] Added rng_uniform rng_gaussian --- include/caffe/util/ocl_wrapper.hpp | 6 ++ src/caffe/ocl/random.cl | 107 ++++++++++++++++++++++++++++- src/caffe/util/math_functions.cpp | 4 ++ src/caffe/util/ocl_wrapper.cpp | 55 +++++++++++++++ 4 files changed, 169 insertions(+), 3 deletions(-) diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index a15b68ff..290ef30f 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -145,6 +145,12 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); +template +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup); + +template +void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V); + template void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index f5a7a4db..9fbb59d7 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -707,7 +707,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_ } template -__kernel void PRNG_threefry4x32( +__kernel void PRNG_threefry4x32_bernoulli( 
__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, T inf, @@ -744,9 +744,110 @@ __kernel void PRNG_threefry4x32( } -template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); +template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); -template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); +template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); //end of the looooooong gpu_random_generator kernel +template +__kernel void PRNG_threefry4x32_uniform( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + uint nrounds, + uint numrandom +){ + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); + randomnumber[gdx] = frnd; + } +} + +template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 
*randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandonm); + +template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm); + +template +__kernel void PRNG_threefry4x32_gaussian( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + float E, + float V, + uint nrounds, + uint numrandom +){ + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey1, ukey2; + + ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; + ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; + + threefry4x32_ctr_t random1, random2; + + if ( gdx < numrandom ) + { + random1 = threefry4x32_R(nrounds, ctr, ukey1); + random2 = threefry4x32_R(nrounds, ctr, ukey2); + float4 frnd1; + + float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution + float r2 = (((float)random2.v[0]) / r); + float r3 = (((float)random1.v[1]) / r); + float r4 = (((float)random2.v[1]) / r); + float r5 = (((float)random1.v[2]) / r); + float r6 = (((float)random2.v[2]) / r); + float r7 = (((float)random1.v[3]) / r); + float r8 = (((float)random2.v[3]) / r); + + if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0){ + r2 += 0.0001; + r4 += 0.0001; + r6 += 0.0001; + r8 += 0.0001; + } + + frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo 
normal distribution sequence + frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + + randomnumber[gdx] = frnd1; + } +} + +template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm); + +template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm); + + diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index ed71edf6..c76703fb 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -801,20 +801,24 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) { template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, float* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object } template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, double* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object } template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, float* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object } template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object } template <> diff --git 
a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index c8f28426..73417ce8 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -64,6 +64,61 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dty template void caffe_gpu_bernoulli(int* a, const unsigned int n, float inf, float sup, float threshold); template void caffe_gpu_bernoulli(int* a, const unsigned int n, double inf, double sup, double threshold); +template +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) +{ + std::string kernel_name = "RNGUniform" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup); +template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup); + +template +void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) +{ + std::string kernel_name = "RNGGaussian" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + 
array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&E); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&V); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_gaussian(float* a, const unsigned int n, float E, float V); +template void caffe_gpu_gaussian(double* a, const unsigned int n, double E, double V); template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){ From e42eeaedf968d2edc90751af308a60e1fa46ebca Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 10 Sep 2015 16:37:50 +0800 Subject: [PATCH 093/124] fix a template error in random.cl --- src/caffe/ocl/random.cl | 359 +++++++++++------------------- src/caffe/util/math_functions.cpp | 1 + 2 files changed, 134 insertions(+), 226 deletions(-) diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index da6c698e..058f41d7 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -30,26 +30,23 @@ //we use the open sourced threefry's GPU implementation typedef uint uint32_t; -struct r123array4x32 { - uint32_t v[4]; -}; +struct r123array4x32 { uint32_t v[4]; }; -enum r123_enum_threefry32x4 +enum r123_enum_threefry32x4 { R_32x4_0_0 = 10, R_32x4_0_1 = 26, R_32x4_1_0 = 11, R_32x4_1_1 = 21, R_32x4_2_0 = 13, R_32x4_2_1 = 27, - R_32x4_3_0 = 23, R_32x4_3_1 = 5, - R_32x4_4_0 = 6, 
R_32x4_4_1 = 20, + R_32x4_3_0 = 23, R_32x4_3_1 = 5, + R_32x4_4_0 = 6, R_32x4_4_1 = 20, R_32x4_5_0 = 17, R_32x4_5_1 = 11, R_32x4_6_0 = 25, R_32x4_6_1 = 10, R_32x4_7_0 = 18, R_32x4_7_1 = 20 }; +inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); inline uint32_t RotL_32(uint32_t x, unsigned int N) - __attribute__((always_inline)); -inline uint32_t RotL_32(uint32_t x, unsigned int N) - { +{ return (x << (N & 31)) | (x >> ((32 - N) & 31)); } @@ -57,22 +54,20 @@ typedef struct r123array4x32 threefry4x32_ctr_t; typedef struct r123array4x32 threefry4x32_key_t; typedef struct r123array4x32 threefry4x32_ukey_t; -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, - threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, - threefry4x32_ctr_t in, threefry4x32_key_t k) - { - threefry4x32_ctr_t X; - uint32_t ks[4 + 1]; - int i; +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) +{ + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; ks[4] = 0x1BD11BDA; /* - for (i = 0; i < 4; i++) - { - ks[i] = k.v[i]; - X.v[i] = in.v[i]; - ks[4] ^= k.v[i]; - }*/ + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ { ks[0] = k.v[0]; X.v[0] = in.v[0]; @@ -94,711 +89,622 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; - if (Nrounds > 0) - { + if (Nrounds > 0) + { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 1) { + } if (Nrounds > 1) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], 
R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 2) { + } if (Nrounds > 2) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 3) { + } if (Nrounds > 3) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 3) { + } if (Nrounds > 3) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 1; - } - if (Nrounds > 4) { + } if (Nrounds > 4) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 5) { + } if (Nrounds > 5) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 6) { + } if (Nrounds > 6) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 7) { + } if (Nrounds > 7) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 7) { + } if (Nrounds > 7) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 2; - } - if (Nrounds > 8) { + } if (Nrounds > 8) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 9) { + } if (Nrounds > 9) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 10) { + } if (Nrounds > 10) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += 
X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 11) { + } if (Nrounds > 11) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 11) { + } if (Nrounds > 11) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 3; - } - if (Nrounds > 12) { + } if (Nrounds > 12) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 13) { + } if (Nrounds > 13) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 14) { + } if (Nrounds > 14) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 15) { + } if (Nrounds > 15) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 15) { + } if (Nrounds > 15) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 4; - } - if (Nrounds > 16) { + } if (Nrounds > 16) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 17) { + } if (Nrounds > 17) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 18) { + } if (Nrounds > 18) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 19) { + } if (Nrounds > 19) { X.v[0] += X.v[3]; X.v[3] = 
RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 19) { + } if (Nrounds > 19) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 5; - } - if (Nrounds > 20) { + } if (Nrounds > 20) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 21) { + } if (Nrounds > 21) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 22) { + } if (Nrounds > 22) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 23) { + } if (Nrounds > 23) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 23) { + } if (Nrounds > 23) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 6; - } - if (Nrounds > 24) { + } if (Nrounds > 24) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 25) { + } if (Nrounds > 25) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 26) { + } if (Nrounds > 26) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 27) { + } if (Nrounds > 27) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 27) 
{ + } if (Nrounds > 27) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 7; - } - if (Nrounds > 28) { + } if (Nrounds > 28) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 29) { + } if (Nrounds > 29) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 30) { + } if (Nrounds > 30) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 31) { + } if (Nrounds > 31) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 31) { + } if (Nrounds > 31) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 8; - } - if (Nrounds > 32) { + } if (Nrounds > 32) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 33) { + } if (Nrounds > 33) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 34) { + } if (Nrounds > 34) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 35) { + } if (Nrounds > 35) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 35) { + } if (Nrounds > 35) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 9; - } - if (Nrounds > 36) { + } if 
(Nrounds > 36) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 37) { + } if (Nrounds > 37) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 38) { + } if (Nrounds > 38) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 39) { + } if (Nrounds > 39) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 39) { + } if (Nrounds > 39) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 10; - } - if (Nrounds > 40) { + } if (Nrounds > 40) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 41) { + } if (Nrounds > 41) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 42) { + } if (Nrounds > 42) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 43) { + } if (Nrounds > 43) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 43) { + } if (Nrounds > 43) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 11; - } - if (Nrounds > 44) { + } if (Nrounds > 44) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], 
R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 45) { + } if (Nrounds > 45) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 46) { + } if (Nrounds > 46) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 47) { + } if (Nrounds > 47) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 47) { + } if (Nrounds > 47) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 12; - } - if (Nrounds > 48) { + } if (Nrounds > 48) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 49) { + } if (Nrounds > 49) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 50) { + } if (Nrounds > 50) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 51) { + } if (Nrounds > 51) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 51) { + } if (Nrounds > 51) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 13; - } - if (Nrounds > 52) { + } if (Nrounds > 52) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 53) { + } if (Nrounds > 53) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] 
^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 54) { + } if (Nrounds > 54) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 55) { + } if (Nrounds > 55) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 55) { + } if (Nrounds > 55) { X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; X.v[4 - 1] += 14; - } - if (Nrounds > 56) { + } if (Nrounds > 56) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 57) { + } if (Nrounds > 57) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 58) { + } if (Nrounds > 58) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 59) { + } if (Nrounds > 59) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 59) { + } if (Nrounds > 59) { X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; X.v[4 - 1] += 15; - } - if (Nrounds > 60) { + } if (Nrounds > 60) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 61) { + } if (Nrounds > 61) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 62) { + } if (Nrounds > 62) { X.v[0] 
+= X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 63) { + } if (Nrounds > 63) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 63) { + } if (Nrounds > 63) { X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; X.v[4 - 1] += 16; - } - if (Nrounds > 64) { + } if (Nrounds > 64) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_0_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_0_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 65) { + } if (Nrounds > 65) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_1_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_1_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 66) { + } if (Nrounds > 66) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_2_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_2_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 67) { + } if (Nrounds > 67) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_3_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_3_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 67) { + } if (Nrounds > 67) { X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; X.v[4 - 1] += 17; - } - if (Nrounds > 68) { + } if (Nrounds > 68) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_4_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_4_1); X.v[3] ^= X.v[2]; - } - if (Nrounds > 69) { + } if (Nrounds > 69) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_5_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_5_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 70) { + } if (Nrounds > 70) { X.v[0] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_6_0); X.v[1] ^= X.v[0]; X.v[2] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_6_1); X.v[3] ^= X.v[2]; - 
} - if (Nrounds > 71) { + } if (Nrounds > 71) { X.v[0] += X.v[3]; X.v[3] = RotL_32(X.v[3], R_32x4_7_0); X.v[3] ^= X.v[0]; X.v[2] += X.v[1]; X.v[1] = RotL_32(X.v[1], R_32x4_7_1); X.v[1] ^= X.v[2]; - } - if (Nrounds > 71) { + } if (Nrounds > 71) { X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; X.v[4 - 1] += 18; - } + } return X; -} +} template __kernel void PRNG_threefry4x32_bernoulli( @@ -812,31 +718,32 @@ __kernel void PRNG_threefry4x32_bernoulli( ){ size_t gdx = get_global_id(0); - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; - - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; - threefry4x32_ctr_t random4; + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - uint4 frnd; + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + threefry4x32_ctr_t random4; - randomnumber[gdx] = frnd; - } + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 
1 : 0; + + randomnumber[gdx] = frnd; + } } + template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); @@ -885,8 +792,8 @@ template __kernel void PRNG_threefry4x32_gaussian( __global float4 *randomnumber, threefry4x32_ctr_t ctr_i, - float E, - float V, + T E, + T V, uint nrounds, uint numrandom ){ diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index a3207f6c..3275d75c 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -901,6 +901,7 @@ void caffe_gpu_rng_uniform(const int n, const double a, const double b, template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, + float* r) { caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object } From 900beb88a042d90cdadbf52b87766282d03ab89e Mon Sep 17 00:00:00 2001 From: Yibing Date: Thu, 10 Sep 2015 20:43:47 +0800 Subject: [PATCH 094/124] Add uint random generator --- src/caffe/layers/dropout_layer.cpp | 20 +++++++++++++++++++ src/caffe/ocl/random.cl | 31 ++++++++++++++++++++++++++++++ src/caffe/util/ocl_wrapper.cpp | 27 ++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index c84c8622..de8f5607 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -77,6 +77,23 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } +#define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\ +do{ \ + int *global_mem_cpu = new int[count]; \ + clEnqueueReadBuffer(amdDevice.CommandQueue, 
(cl_mem)global_mem, \ + CL_TRUE, 0, sizeof(int)*count, global_mem_cpu,0, NULL, NULL); \ + size_t sample_interval = count/num; \ + if(sample_interval == 0){ \ + sample_interval=1; \ + } \ + printf("%s: ", marker); \ + for(int i=0; i void DropoutLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -101,6 +118,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, } else { caffe_gpu_copy(count, bottom_data, top_data); } +CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); } template @@ -117,6 +135,8 @@ void DropoutLayer::Backward_gpu(const vector*>& top, } else { caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); } + CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); + CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff"); } } diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index 058f41d7..438931ec 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -788,6 +788,37 @@ template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_thre template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm); + +__kernel void PRNG_threefry4x32_uint_uniform( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + uint inf, + uint sup, + uint nrounds, + uint numrandom +){ + size_t gdx = get_global_id(0); + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = random4.v[0] % (sup - inf) + inf; + frnd.y = random4.v[1] % (sup - inf) + inf; + frnd.z = random4.v[2] % (sup - inf) + inf; + frnd.w = random4.v[3] % (sup - inf) + inf; + randomnumber[gdx] = frnd; + } +} + + template __kernel void 
PRNG_threefry4x32_gaussian( __global float4 *randomnumber, diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 20535868..75b69215 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -145,6 +145,33 @@ void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup); template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup); +void caffe_gpu_uniform(const unsigned int n, unsigned int *r) +{ + std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_uint inf = 0; + cl_uint sup = UINT_MAX; + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&r); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} + template void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) { From 4adb3d25c5379a0118d4f323394543af1832f485 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Thu, 10 Sep 2015 08:47:08 -0700 Subject: [PATCH 095/124] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2c9a0ef1..faf725dd 
100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ #OpenCL caffe -This is an OpenCL implementation of the popular caffe DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. +This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. + +OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language is supported by more than 20 companies, enabling this DNN framework to be used on heterogeneous platforms from a variety of commercial chip manufacturers. #Design features -All layers ported to OpenCL - -Performance improvement by batched sgemm implementation for conv layer + -Performance improvement by batched implementation for conv layer based on clBLAS -User can choose optimal batch number depending on H/W, image size and minibatch size @@ -13,7 +15,7 @@ This is an OpenCL implementation of the popular caffe DNN framework (https://git -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users - -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19 + -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19 Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future. 
@@ -38,10 +40,10 @@ For more information on how to install, use or contribute to this code base, ple https://github.com/amd/OpenCL-caffe/wiki #Support needed -We encourage the contribution and support from the community to improve it together. + As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. #License -Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. +Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. # Original Caffe information ## Caffe From d2a24e6815e1cdd87d9d969e0c9c4aff57f8cb31 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Thu, 10 Sep 2015 08:51:09 -0700 Subject: [PATCH 096/124] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index faf725dd..dd3933e6 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. 
-OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language is supported by more than 20 companies, enabling this DNN framework to be used on heterogeneous platforms from a variety of commercial chip manufacturers. +OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. #Design features - -All layers ported to OpenCL + -All caffe layers ported to OpenCL -Performance improvement by batched implementation for conv layer based on clBLAS @@ -37,13 +37,13 @@ We will keep updating the latest performance as we make optimizations. Fury resu #Wiki For more information on how to install, use or contribute to this code base, please visit our wiki page: -https://github.com/amd/OpenCL-caffe/wiki + https://github.com/amd/OpenCL-caffe/wiki #Support needed As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. #License -Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence. +Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license. 
# Original Caffe information ## Caffe From 280f8139dce78e5c239ad32569199934bb88d562 Mon Sep 17 00:00:00 2001 From: Junli Date: Thu, 10 Sep 2015 16:20:04 -0700 Subject: [PATCH 097/124] add notation --- src/caffe/layers/absval_layer.cpp | 1 + src/caffe/layers/base_conv_layer.cpp | 114 ++++++++++---------- src/caffe/layers/bnll_layer.cpp | 1 + src/caffe/layers/concat_layer.cpp | 1 + src/caffe/layers/contrastive_loss_layer.cpp | 1 + src/caffe/layers/conv_layer.cpp | 2 + src/caffe/layers/dropout_layer.cpp | 4 +- src/caffe/layers/eltwise_layer.cpp | 1 + src/caffe/layers/euclidean_loss_layer.cpp | 1 + src/caffe/layers/exp_layer.cpp | 1 + src/caffe/layers/filter_layer.cpp | 1 + src/caffe/layers/hdf5_data_layer.cpp | 1 + src/caffe/layers/hdf5_output_layer.cpp | 1 + src/caffe/layers/pooling_layer.cpp | 2 + src/caffe/layers/power_layer.cpp | 3 +- src/caffe/layers/softmax_layer.cpp | 4 +- src/caffe/layers/softmax_loss_layer.cpp | 3 +- src/caffe/layers/split_layer.cpp | 4 +- src/caffe/syncedmem.cpp | 8 +- 19 files changed, 86 insertions(+), 68 deletions(-) diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 85faa8d3..5dc99b75 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -35,6 +35,7 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void AbsValLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index cefa8a66..149b1a21 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -295,6 +295,65 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } } +template +void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, + const Dtype* bias) { + caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, + height_out_ * width_out_, 1, (Dtype) 1., bias, 0, + 
reinterpret_cast(bias_multiplier_.gpu_data()), 0, + (Dtype) 1., output, top_offset_); +} + +template +void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, + const Dtype* weights, Dtype* input) { + Dtype* col_buff = col_buffer_.mutable_gpu_data(); + if (is_1x1_) { + col_buff = input; + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, + (Dtype) 1., weights, weight_offset_ * g, + output, top_offset_ + output_offset_ * g, + (Dtype) 0., col_buff, col_offset_ * g); + } + if (!is_1x1_) { + conv_col2im_gpu(col_buff, input); + } +} + + +template +void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + if (!is_1x1_) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + col_buff = col_buffer_.gpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, + (Dtype) 1., output, top_offset_, + (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., + (Dtype*) weights, weight_offset_ * g); + } +} + +template +void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, + const Dtype* input) { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num_output_, N_, + (Dtype) 1., input, top_offset_, N_, + reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, + bias, (size_t) 0, 1); +} + +// begin: code written/modified by AMD template void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { @@ -335,14 +394,6 @@ void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, opt_num2); } -template -void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, - const Dtype* bias) { - caffe_gpu_gemm < Dtype > (CblasNoTrans, 
CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype) 1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype) 1., output, top_offset_); -} template void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, @@ -354,25 +405,6 @@ void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, (Dtype) 1., output, top_offset_ + num_output_ * N_ * z); } -template -void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_gpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ - / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype) 1., weights, weight_offset_ * g, - output, top_offset_ + output_offset_ * g, - (Dtype) 0., col_buff, col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_gpu(col_buff, input); - } -} template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, @@ -412,23 +444,6 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, } } -template -void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - col_buff = col_buffer_.gpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ - / group_, kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype) 1., output, top_offset_, - (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., - (Dtype*) weights, weight_offset_ * g); - } -} template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* output, Dtype* weights) { @@ -463,16 +478,7 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, } } -template -void 
BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, - const Dtype* input) { - caffe_gpu_gemv < Dtype - > (CblasNoTrans, num_output_, N_, - (Dtype) 1., input, top_offset_, N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, - bias, (size_t) 0, 1); -} - +// end: code is written/modified by AMD #endif // !CPU_ONLY INSTANTIATE_CLASS (BaseConvolutionLayer); diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 11b78a15..ad422a11 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -39,6 +39,7 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void BNLLLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 7d55ef40..28aac6b2 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -92,6 +92,7 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void ConcatLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 6a91fdfd..f6265726 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -111,6 +111,7 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void ContrastiveLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index bbe07f37..0a989f69 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -69,6 +69,7 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } +// begin: code written/modified by AMD template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const 
vector*>& top) { @@ -234,6 +235,7 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } } +// end: code written/modified by AMD #ifdef CPU_ONLY STUB_GPU(ConvolutionLayer); diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index de8f5607..6692f238 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -94,6 +94,7 @@ do{ \ delete []global_mem_cpu; \ }while(0) +// begin: code is written/modified by AMD template void DropoutLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -116,6 +117,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, top_data); #endif } else { + if(bottom_data != top_data) caffe_gpu_copy(count, bottom_data, top_data); } CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); @@ -139,7 +141,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff"); } } - +// end: code is written/modified by AMD #ifdef CPU_ONLY STUB_GPU(DropoutLayer); #endif diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index e7b97b0d..b904ad39 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -153,6 +153,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void EltwiseLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 56dc48ec..9107f119 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -47,6 +47,7 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 
bf783786..087da677 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -61,6 +61,7 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void ExpLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index f7096a09..05dc2783 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -117,6 +117,7 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void FilterLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 6f67dc06..6c6d8dec 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -158,6 +158,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, } } +// begin: code written/modified by AMD template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index baad0dea..a8c062bc 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -67,6 +67,7 @@ void HDF5OutputLayer::Backward_cpu(const vector*>& top, return; } +// begin: code written/modified by AMD template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 92c71582..47830228 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -309,6 +309,7 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void PoolingLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -407,6 +408,7 @@ void PoolingLayer::Backward_gpu(const 
vector*>& top, } } +// end: code written/modified by AMD #ifdef CPU_ONLY STUB_GPU(PoolingLayer); #endif diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 93ef9e1f..0cf82c35 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -96,6 +96,7 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void PowerLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -168,7 +169,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } } - +// end: code written/modified by AMD #ifdef CPU_ONLY STUB_GPU(PowerLayer); #endif diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index d4cab577..feb15321 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -91,7 +91,7 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } - +// begin: code written/modified by AMD template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -148,7 +148,7 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); } - +// end: code written/modified by AMD #ifdef CPU_ONLY STUB_GPU(SoftmaxLayer); #endif diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 86a0d37a..6b9e9e67 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -133,6 +133,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } +// begin: code written/modified by AMD template void SoftmaxWithLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -198,7 +199,7 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, } } } - +// end: code 
written/modified by AMD #ifdef CPU_ONLY STUB_GPU(SoftmaxWithLossLayer); #endif diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 8b19d293..54bea0d6 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -61,6 +61,7 @@ void SplitLayer::Forward_gpu(const vector*>& bottom, } } +// begin: code written/modified by AMD template void SplitLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { @@ -79,9 +80,8 @@ void SplitLayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); } - } - +// end: code written/modified by AMD #ifdef CPU_ONLY STUB_GPU(SplitLayer); #endif diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 67f5984b..976130bf 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -52,6 +52,7 @@ SyncedMemory::~SyncedMemory() { clReleaseKernel (oclmem_kernel); } +//begin: code written/modified by AMD. 
void SyncedMemory::ocl_setup() { cl_int err = 0; oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); @@ -63,7 +64,6 @@ inline void SyncedMemory::to_cpu() { case UNINITIALIZED: gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); - //} cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, @@ -89,9 +89,6 @@ inline void SyncedMemory::to_cpu() { head_ = SYNCED; #else NO_GPU; -#endif -#ifdef Track_data_transfer - LOG(WARNING) << "sync: data from GPU to CPU"; #endif break; } @@ -130,9 +127,6 @@ inline void SyncedMemory::to_gpu() { (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); clFinish(amdDevice.CommandQueue); head_ = SYNCED; -#ifdef Track_data_transfer - LOG(WARNING) << "sync: data from CPU to GPU"; -#endif break; } case HEAD_AT_GPU: From 30d5f21c70d16f446cdcae233683789cc996e022 Mon Sep 17 00:00:00 2001 From: Noplz Date: Fri, 11 Sep 2015 13:59:17 +0800 Subject: [PATCH 098/124] Adjust the indent --- include/caffe/blob.hpp | 501 ++- include/caffe/common.hpp | 128 +- include/caffe/common_layers.hpp | 1113 ++++--- include/caffe/data_layers.hpp | 636 ++-- include/caffe/data_transformer.hpp | 256 +- include/caffe/device.hpp | 77 +- include/caffe/filler.hpp | 348 +- include/caffe/internal_thread.hpp | 33 +- include/caffe/layer.hpp | 850 +++-- include/caffe/layer_factory.hpp | 102 +- include/caffe/loss_layers.hpp | 1094 +++---- include/caffe/net.hpp | 484 +-- include/caffe/neuron_layers.hpp | 1342 ++++---- include/caffe/python_layer.hpp | 95 +- include/caffe/solver.hpp | 288 +- include/caffe/syncedmem.hpp | 98 +- include/caffe/test/test_caffe_main.hpp | 39 +- .../caffe/test/test_gradient_check_util.hpp | 438 +-- include/caffe/util/benchmark.hpp | 80 +- include/caffe/util/cudnn.hpp | 212 +- include/caffe/util/db.hpp | 66 +- include/caffe/util/db_leveldb.hpp | 131 +- include/caffe/util/db_lmdb.hpp | 149 +- include/caffe/util/im2col.hpp | 84 +- 
include/caffe/util/insert_splits.hpp | 8 +- include/caffe/util/io.hpp | 116 +- include/caffe/util/math_functions.hpp | 79 +- include/caffe/util/mkl_alternate.hpp | 14 +- include/caffe/util/ocl_util.hpp | 2 +- include/caffe/util/ocl_wrapper.hpp | 246 +- include/caffe/util/rng.hpp | 29 +- include/caffe/util/upgrade_proto.hpp | 10 +- include/caffe/vision_layers.hpp | 961 +++--- src/caffe/blob.cpp | 650 ++-- src/caffe/common.cpp | 147 +- src/caffe/data_transformer.cpp | 928 +++--- src/caffe/device.cpp | 712 ++-- src/caffe/internal_thread.cpp | 40 +- src/caffe/layer_factory.cpp | 179 +- src/caffe/layers/absval_layer.cpp | 62 +- src/caffe/layers/accuracy_layer.cpp | 124 +- src/caffe/layers/argmax_layer.cpp | 73 +- src/caffe/layers/base_conv_layer.cpp | 721 ++-- src/caffe/layers/base_data_layer.cpp | 150 +- src/caffe/layers/bnll_layer.cpp | 76 +- src/caffe/layers/concat_layer.cpp | 214 +- src/caffe/layers/contrastive_loss_layer.cpp | 278 +- src/caffe/layers/conv_layer.cpp | 369 +-- src/caffe/layers/data_layer.cpp | 176 +- src/caffe/layers/deconv_layer.cpp | 194 +- src/caffe/layers/dropout_layer.cpp | 161 +- src/caffe/layers/dummy_data_layer.cpp | 189 +- src/caffe/layers/eltwise_layer.cpp | 420 +-- src/caffe/layers/euclidean_loss_layer.cpp | 98 +- src/caffe/layers/exp_layer.cpp | 134 +- src/caffe/layers/filter_layer.cpp | 286 +- src/caffe/layers/flatten_layer.cpp | 40 +- src/caffe/layers/hdf5_data_layer.cpp | 311 +- src/caffe/layers/hdf5_output_layer.cpp | 122 +- src/caffe/layers/hinge_loss_layer.cpp | 110 +- src/caffe/layers/im2col_layer.cpp | 158 +- src/caffe/layers/image_data_layer.cpp | 243 +- src/caffe/layers/infogain_loss_layer.cpp | 159 +- src/caffe/layers/inner_product_layer.cpp | 257 +- src/caffe/layers/log_layer.cpp | 198 +- src/caffe/layers/loss_layer.cpp | 24 +- src/caffe/layers/lrn_layer.cpp | 509 +-- src/caffe/layers/memory_data_layer.cpp | 166 +- .../multinomial_logistic_loss_layer.cpp | 77 +- src/caffe/layers/mvn_layer.cpp | 424 ++- 
src/caffe/layers/neuron_layer.cpp | 4 +- src/caffe/layers/pooling_layer.cpp | 736 ++--- src/caffe/layers/power_layer.cpp | 277 +- src/caffe/layers/prelu_layer.cpp | 337 +- src/caffe/layers/reduction_layer.cpp | 352 +- src/caffe/layers/relu_layer.cpp | 74 +- src/caffe/layers/reshape_layer.cpp | 154 +- .../sigmoid_cross_entropy_loss_layer.cpp | 131 +- src/caffe/layers/sigmoid_layer.cpp | 69 +- src/caffe/layers/silence_layer.cpp | 31 +- src/caffe/layers/slice_layer.cpp | 181 +- src/caffe/layers/softmax_layer.cpp | 226 +- src/caffe/layers/softmax_loss_layer.cpp | 314 +- src/caffe/layers/split_layer.cpp | 110 +- src/caffe/layers/spp_layer.cpp | 303 +- src/caffe/layers/tanh_layer.cpp | 69 +- src/caffe/layers/threshold_layer.cpp | 32 +- src/caffe/layers/window_data_layer.cpp | 768 +++-- src/caffe/net.cpp | 1507 +++++---- src/caffe/ocl/bnll_layer.cl | 24 +- src/caffe/ocl/concat_layer.cl | 44 +- src/caffe/ocl/contrastive_loss_layer.cl | 64 +- src/caffe/ocl/dropout_layer.cl | 12 +- src/caffe/ocl/eltwise_layer.cl | 72 +- src/caffe/ocl/im2col.cl | 398 +-- src/caffe/ocl/lrn_layer.cl | 190 +- src/caffe/ocl/pooling_layer.cl | 446 +-- src/caffe/ocl/prelu_layer.cl | 34 +- src/caffe/ocl/random.cl | 1720 +++++----- src/caffe/ocl/relu_layer.cl | 14 +- src/caffe/ocl/sigmoid_layer.cl | 14 +- src/caffe/ocl/softmax_layer.cl | 192 +- src/caffe/ocl/softmaxwithloss_layer.cl | 112 +- src/caffe/ocl/tanh_layer.cl | 14 +- src/caffe/ocl/threshold_layer.cl | 6 +- src/caffe/ocl/util.cl | 160 +- src/caffe/solver.cpp | 1309 ++++---- src/caffe/syncedmem.cpp | 207 +- src/caffe/util/benchmark.cpp | 141 +- src/caffe/util/cudnn.cpp | 28 +- src/caffe/util/db.cpp | 30 +- src/caffe/util/db_leveldb.cpp | 20 +- src/caffe/util/db_lmdb.cpp | 54 +- src/caffe/util/im2col.cpp | 570 ++-- src/caffe/util/im2col.cu | 197 +- src/caffe/util/insert_splits.cpp | 241 +- src/caffe/util/io.cpp | 414 ++- src/caffe/util/math_functions.cpp | 948 +++--- src/caffe/util/math_functions.cu | 209 +- src/caffe/util/ocl_util.cpp | 72 +- 
src/caffe/util/ocl_wrapper.cpp | 2903 ++++++++--------- src/caffe/util/upgrade_proto.cpp | 1761 +++++----- 122 files changed, 18665 insertions(+), 18918 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 26a75558..9f22a082 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -23,277 +23,276 @@ namespace caffe { */ template class Blob { - public: - Blob() - : - data_(), diff_(), count_(0), capacity_(0) { - } + public: + Blob() + : data_(), diff_(), count_(0), capacity_(0) { + } - /// @brief Deprecated; use Blob(const vector& shape). - explicit Blob(const int num, const int channels, const int height, - const int width); - explicit Blob(const vector& shape); + /// @brief Deprecated; use Blob(const vector& shape). + explicit Blob(const int num, const int channels, const int height, + const int width); + explicit Blob(const vector& shape); - /// @brief Deprecated; use Reshape(const vector& shape). - void Reshape(const int num, const int channels, const int height, - const int width); - /** - * @brief Change the dimensions of the blob, allocating new memory if - * necessary. - * - * This function can be called both to create an initial allocation - * of memory, and to adjust the dimensions of a top blob during Layer::Reshape - * or Layer::Forward. When changing the size of blob, memory will only be - * reallocated if sufficient memory does not already exist, and excess memory - * will never be freed. - * - * Note that reshaping an input blob and immediately calling Net::Backward is - * an error; either Net::Forward or Net::Reshape need to be called to - * propagate the new input shape to higher layers. 
- */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); - void ReshapeLike(const Blob& other); - inline string shape_string() const { - ostringstream stream; - for (int i = 0; i < shape_.size(); ++i) { - stream << shape_[i] << " "; - } - stream << "(" << count_ << ")"; - return stream.str(); - } - inline const vector& shape() const { - return shape_; - } - /** - * @brief Returns the dimension of the index-th axis (or the negative index-th - * axis from the end, if index is negative). - * - * @param index the axis index, which may be negative as it will be - * "canonicalized" using CanonicalAxisIndex. - * Dies on out of range index. - */ - inline int shape(int index) const { - return shape_[CanonicalAxisIndex(index)]; - } - inline int num_axes() const { - return shape_.size(); - } - inline int count() const { - return count_; - } + /// @brief Deprecated; use Reshape(const vector& shape). + void Reshape(const int num, const int channels, const int height, + const int width); + /** + * @brief Change the dimensions of the blob, allocating new memory if + * necessary. + * + * This function can be called both to create an initial allocation + * of memory, and to adjust the dimensions of a top blob during Layer::Reshape + * or Layer::Forward. When changing the size of blob, memory will only be + * reallocated if sufficient memory does not already exist, and excess memory + * will never be freed. + * + * Note that reshaping an input blob and immediately calling Net::Backward is + * an error; either Net::Forward or Net::Reshape need to be called to + * propagate the new input shape to higher layers. 
+ */ + void Reshape(const vector& shape); + void Reshape(const BlobShape& shape); + void ReshapeLike(const Blob& other); + inline string shape_string() const { + ostringstream stream; + for (int i = 0; i < shape_.size(); ++i) { + stream << shape_[i] << " "; + } + stream << "(" << count_ << ")"; + return stream.str(); + } + inline const vector& shape() const { + return shape_; + } + /** + * @brief Returns the dimension of the index-th axis (or the negative index-th + * axis from the end, if index is negative). + * + * @param index the axis index, which may be negative as it will be + * "canonicalized" using CanonicalAxisIndex. + * Dies on out of range index. + */ + inline int shape(int index) const { + return shape_[CanonicalAxisIndex(index)]; + } + inline int num_axes() const { + return shape_.size(); + } + inline int count() const { + return count_; + } - /** - * @brief Compute the volume of a slice; i.e., the product of dimensions - * among a range of axes. - * - * @param start_axis The first axis to include in the slice. - * - * @param end_axis The first axis to exclude from the slice. - */ - inline int count(int start_axis, int end_axis) const { - CHECK_LE(start_axis, end_axis); - CHECK_GE(start_axis, 0); - CHECK_GE(end_axis, 0); - CHECK_LE(start_axis, num_axes()); - CHECK_LE(end_axis, num_axes()); - int count = 1; - for (int i = start_axis; i < end_axis; ++i) { - count *= shape(i); - } - return count; - } - /** - * @brief Compute the volume of a slice spanning from a particular first - * axis to the final axis. - * - * @param start_axis The first axis to include in the slice. - */ - inline int count(int start_axis) const { - return count(start_axis, num_axes()); - } + /** + * @brief Compute the volume of a slice; i.e., the product of dimensions + * among a range of axes. + * + * @param start_axis The first axis to include in the slice. + * + * @param end_axis The first axis to exclude from the slice. 
+ */ + inline int count(int start_axis, int end_axis) const { + CHECK_LE(start_axis, end_axis); + CHECK_GE(start_axis, 0); + CHECK_GE(end_axis, 0); + CHECK_LE(start_axis, num_axes()); + CHECK_LE(end_axis, num_axes()); + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape(i); + } + return count; + } + /** + * @brief Compute the volume of a slice spanning from a particular first + * axis to the final axis. + * + * @param start_axis The first axis to include in the slice. + */ + inline int count(int start_axis) const { + return count(start_axis, num_axes()); + } - /** - * @brief Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param index the axis index. - * If 0 <= index < num_axes(), return index. - * If -num_axes <= index <= -1, return (num_axes() - (-index)), - * e.g., the last axis index (num_axes() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int CanonicalAxisIndex(int axis_index) const { - CHECK_GE(axis_index, -num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - CHECK_LT(axis_index, num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - if (axis_index < 0) { - return axis_index + num_axes(); - } - return axis_index; - } + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param index the axis index. + * If 0 <= index < num_axes(), return index. + * If -num_axes <= index <= -1, return (num_axes() - (-index)), + * e.g., the last axis index (num_axes() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. 
+ */ + inline int CanonicalAxisIndex(int axis_index) const { + CHECK_GE(axis_index, -num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + CHECK_LT(axis_index, num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + if (axis_index < 0) { + return axis_index + num_axes(); + } + return axis_index; + } - /// @brief Deprecated legacy shape accessor num: use shape(0) instead. - inline int num() const { - return LegacyShape(0); - } - /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. - inline int channels() const { - return LegacyShape(1); - } - /// @brief Deprecated legacy shape accessor height: use shape(2) instead. - inline int height() const { - return LegacyShape(2); - } - /// @brief Deprecated legacy shape accessor width: use shape(3) instead. - inline int width() const { - return LegacyShape(3); - } - inline int LegacyShape(int index) const { - CHECK_LE(num_axes(), 4) - << "Cannot use legacy accessors on Blobs with > 4 axes."; - CHECK_LT(index, 4); - CHECK_GE(index, -4); - if (index >= num_axes() || index < -num_axes()) { - // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse - // indexing) -- this special case simulates the one-padding used to fill - // extraneous axes of legacy blobs. - return 1; - } - return shape(index); - } + /// @brief Deprecated legacy shape accessor num: use shape(0) instead. + inline int num() const { + return LegacyShape(0); + } + /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. + inline int channels() const { + return LegacyShape(1); + } + /// @brief Deprecated legacy shape accessor height: use shape(2) instead. + inline int height() const { + return LegacyShape(2); + } + /// @brief Deprecated legacy shape accessor width: use shape(3) instead. 
+ inline int width() const { + return LegacyShape(3); + } + inline int LegacyShape(int index) const { + CHECK_LE(num_axes(), 4) + << "Cannot use legacy accessors on Blobs with > 4 axes."; + CHECK_LT(index, 4); + CHECK_GE(index, -4); + if (index >= num_axes() || index < -num_axes()) { + // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse + // indexing) -- this special case simulates the one-padding used to fill + // extraneous axes of legacy blobs. + return 1; + } + return shape(index); + } - inline int offset(const int n, const int c = 0, const int h = 0, - const int w = 0) const { - CHECK_GE(n, 0); - CHECK_LE(n, num()); - CHECK_GE(channels(), 0); - CHECK_LE(c, channels()); - CHECK_GE(height(), 0); - CHECK_LE(h, height()); - CHECK_GE(width(), 0); - CHECK_LE(w, width()); - return ((n * channels() + c) * height() + h) * width() + w; - } + inline int offset(const int n, const int c = 0, const int h = 0, + const int w = 0) const { + CHECK_GE(n, 0); + CHECK_LE(n, num()); + CHECK_GE(channels(), 0); + CHECK_LE(c, channels()); + CHECK_GE(height(), 0); + CHECK_LE(h, height()); + CHECK_GE(width(), 0); + CHECK_LE(w, width()); + return ((n * channels() + c) * height() + h) * width() + w; + } - inline int offset(const vector& indices) const { - CHECK_LE(indices.size(), num_axes()); - int offset = 0; - for (int i = 0; i < num_axes(); ++i) { - offset *= shape(i); - if (indices.size() > i) { - CHECK_GE(indices[i], 0); - CHECK_LT(indices[i], shape(i)); - offset += indices[i]; - } - } - return offset; - } - /** - * @brief Copy from a source Blob. 
- * - * @param source the Blob to copy from - * @param copy_diff if false, copy the data; if true, copy the diff - * @param reshape if false, require this Blob to be pre-shaped to the shape - * of other (and die otherwise); if true, Reshape this Blob to other's - * shape if necessary - */ - void CopyFrom(const Blob& source, bool copy_diff = false, - bool reshape = false); + inline int offset(const vector& indices) const { + CHECK_LE(indices.size(), num_axes()); + int offset = 0; + for (int i = 0; i < num_axes(); ++i) { + offset *= shape(i); + if (indices.size() > i) { + CHECK_GE(indices[i], 0); + CHECK_LT(indices[i], shape(i)); + offset += indices[i]; + } + } + return offset; + } + /** + * @brief Copy from a source Blob. + * + * @param source the Blob to copy from + * @param copy_diff if false, copy the data; if true, copy the diff + * @param reshape if false, require this Blob to be pre-shaped to the shape + * of other (and die otherwise); if true, Reshape this Blob to other's + * shape if necessary + */ + void CopyFrom(const Blob& source, bool copy_diff = false, + bool reshape = false); - inline Dtype data_at(const int n, const int c, const int h, - const int w) const { - return cpu_data()[offset(n, c, h, w)]; - } + inline Dtype data_at(const int n, const int c, const int h, + const int w) const { + return cpu_data()[offset(n, c, h, w)]; + } - inline Dtype diff_at(const int n, const int c, const int h, - const int w) const { - return cpu_diff()[offset(n, c, h, w)]; - } + inline Dtype diff_at(const int n, const int c, const int h, + const int w) const { + return cpu_diff()[offset(n, c, h, w)]; + } - inline Dtype data_at(const vector& index) const { - return cpu_data()[offset(index)]; - } + inline Dtype data_at(const vector& index) const { + return cpu_data()[offset(index)]; + } - inline Dtype diff_at(const vector& index) const { - return cpu_diff()[offset(index)]; - } + inline Dtype diff_at(const vector& index) const { + return cpu_diff()[offset(index)]; + } - 
inline const shared_ptr& data() const { - CHECK(data_); - return data_; - } + inline const shared_ptr& data() const { + CHECK(data_); + return data_; + } - inline const shared_ptr& diff() const { - CHECK(diff_); - return diff_; - } + inline const shared_ptr& diff() const { + CHECK(diff_); + return diff_; + } - const Dtype* cpu_data() const; - void set_cpu_data(Dtype* data); - const Dtype* gpu_data() const; - const Dtype* gpu_cache_data() const; - const Dtype* cpu_diff() const; - const Dtype* gpu_diff() const; - Dtype* mutable_cpu_data(); - Dtype* mutable_gpu_data(); - Dtype* mutable_cpu_diff(); - Dtype* mutable_gpu_diff(); - void Update(); - void FromProto(const BlobProto& proto, bool reshape = true); - void ToProto(BlobProto* proto, bool write_diff = false) const; + const Dtype* cpu_data() const; + void set_cpu_data(Dtype* data); + const Dtype* gpu_data() const; + const Dtype* gpu_cache_data() const; + const Dtype* cpu_diff() const; + const Dtype* gpu_diff() const; + Dtype* mutable_cpu_data(); + Dtype* mutable_gpu_data(); + Dtype* mutable_cpu_diff(); + Dtype* mutable_gpu_diff(); + void Update(); + void FromProto(const BlobProto& proto, bool reshape = true); + void ToProto(BlobProto* proto, bool write_diff = false) const; - /// @brief Compute the sum of absolute values (L1 norm) of the data. - Dtype asum_data() const; - /// @brief Compute the sum of absolute values (L1 norm) of the diff. - Dtype asum_diff() const; - /// @brief Compute the sum of squares (L2 norm squared) of the data. - Dtype sumsq_data() const; - /// @brief Compute the sum of squares (L2 norm squared) of the diff. - Dtype sumsq_diff() const; + /// @brief Compute the sum of absolute values (L1 norm) of the data. + Dtype asum_data() const; + /// @brief Compute the sum of absolute values (L1 norm) of the diff. + Dtype asum_diff() const; + /// @brief Compute the sum of squares (L2 norm squared) of the data. 
+ Dtype sumsq_data() const; + /// @brief Compute the sum of squares (L2 norm squared) of the diff. + Dtype sumsq_diff() const; - /// @brief Scale the blob data by a constant factor. - void scale_data(Dtype scale_factor); - /// @brief Scale the blob diff by a constant factor. - void scale_diff(Dtype scale_factor); + /// @brief Scale the blob data by a constant factor. + void scale_data(Dtype scale_factor); + /// @brief Scale the blob diff by a constant factor. + void scale_diff(Dtype scale_factor); - /** - * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the - * data_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's data_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareData(const Blob& other); - /** - * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the - * diff_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's diff_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareDiff(const Blob& other); - void set_data_layer() { - data_->set_data_layer(); - diff_->set_data_layer(); - } + /** + * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the + * data_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's data_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareData(const Blob& other); + /** + * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the + * diff_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. 
+ * + * This deallocates the SyncedMemory holding this Blob's diff_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareDiff(const Blob& other); + void set_data_layer() { + data_->set_data_layer(); + diff_->set_data_layer(); + } - bool ShapeEquals(const BlobProto& other); + bool ShapeEquals(const BlobProto& other); - protected: - shared_ptr data_; - shared_ptr diff_; - vector shape_; - int count_; - int capacity_; + protected: + shared_ptr data_; + shared_ptr diff_; + vector shape_; + int count_; + int capacity_; - DISABLE_COPY_AND_ASSIGN (Blob); + DISABLE_COPY_AND_ASSIGN (Blob); }; // class Blob diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 0f3a7667..df99c7cf 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -185,81 +185,81 @@ void GlobalInit(int* pargc, char*** pargv); // A singleton class to hold common caffe stuff, such as the handler that // caffe is going to use for cublas, curand, etc. class Caffe { - public: - ~Caffe(); - inline static Caffe& Get() { - if (!singleton_.get()) { - singleton_.reset(new Caffe()); - } - return *singleton_; - } - enum Brew { - CPU, GPU, APU - }; + public: + ~Caffe(); + inline static Caffe& Get() { + if (!singleton_.get()) { + singleton_.reset(new Caffe()); + } + return *singleton_; + } + enum Brew { + CPU, GPU, APU + }; - // This random number generator facade hides boost and CUDA rng - // implementation from one another (for cross-platform compatibility). - class RNG { - public: - RNG(); - explicit RNG(unsigned int seed); - explicit RNG(const RNG&); - RNG& operator=(const RNG&); - void* generator(); - private: - class Generator; - shared_ptr generator_; - }; + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). 
+ class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); - } - return *(Get().random_generator_); - } + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); + } #ifndef CPU_ONLY - //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - //inline static curandGenerator_t curand_generator() { - // return Get().curand_generator_; - //} + //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + //inline static curandGenerator_t curand_generator() { + // return Get().curand_generator_; + //} #endif - // Returns the mode: running on CPU or GPU. - inline static Brew mode() { - return Get().mode_; - } - // The setters for the variables - // Sets the mode. It is recommended that you don't change the mode halfway - // into the program since that may cause allocation of pinned memory being - // freed in a non-pinned way, which may cause problems - I haven't verified - // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { - Get().mode_ = mode; - } - // Sets the random seed of both boost and curand - static void set_random_seed(const unsigned int seed); - // Sets the device. Since we have cublas and curand stuff, set device also - // requires us to reset those values. - static void SetDevice(const int device_id); - // Prints the current GPU status. - static void DeviceQuery(); + // Returns the mode: running on CPU or GPU. 
+ inline static Brew mode() { + return Get().mode_; + } + // The setters for the variables + // Sets the mode. It is recommended that you don't change the mode halfway + // into the program since that may cause allocation of pinned memory being + // freed in a non-pinned way, which may cause problems - I haven't verified + // it personally but better to note it here in the header file. + inline static void set_mode(Brew mode) { + Get().mode_ = mode; + } + // Sets the random seed of both boost and curand + static void set_random_seed(const unsigned int seed); + // Sets the device. Since we have cublas and curand stuff, set device also + // requires us to reset those values. + static void SetDevice(const int device_id); + // Prints the current GPU status. + static void DeviceQuery(); - protected: + protected: #ifndef CPU_ONLY - //cublasHandle_t cublas_handle_; - //curandGenerator_t curand_generator_; + //cublasHandle_t cublas_handle_; + //curandGenerator_t curand_generator_; #endif - shared_ptr random_generator_; + shared_ptr random_generator_; - Brew mode_; - static shared_ptr singleton_; + Brew mode_; + static shared_ptr singleton_; - private: - // The private constructor to avoid duplicate instantiation. - Caffe(); + private: + // The private constructor to avoid duplicate instantiation. + Caffe(); - DISABLE_COPY_AND_ASSIGN(Caffe); + DISABLE_COPY_AND_ASSIGN(Caffe); }; } // namespace caffe diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d892b5b5..ab796286 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -27,56 +27,55 @@ namespace caffe { */ template class ArgMaxLayer: public Layer { - public: - /** - * @param param provides ArgMaxParameter argmax_param, - * with ArgMaxLayer options: - * - top_k (\b optional uint, default 1). - * the number @f$ K @f$ of maximal items to output. - * - out_max_val (\b optional bool, default false). 
- * if set, output a vector of pairs (max_ind, max_val) for each image. - */ - explicit ArgMaxLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "ArgMax"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val - * @f$ (N \times 2 \times K \times 1) @f$ - * the computed outputs @f$ - * y_n = \arg\max\limits_i x_{ni} - * @f$ (for @f$ K = 1 @f$). - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - NOT_IMPLEMENTED; - } - bool out_max_val_; - size_t top_k_; + public: + /** + * @param param provides ArgMaxParameter argmax_param, + * with ArgMaxLayer options: + * - top_k (\b optional uint, default 1). + * the number @f$ K @f$ of maximal items to output. + * - out_max_val (\b optional bool, default false). + * if set, output a vector of pairs (max_ind, max_val) for each image. 
+ */ + explicit ArgMaxLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ArgMax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val + * @f$ (N \times 2 \times K \times 1) @f$ + * the computed outputs @f$ + * y_n = \arg\max\limits_i x_{ni} + * @f$ (for @f$ K = 1 @f$). + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + bool out_max_val_; + size_t top_k_; }; /** @@ -85,79 +84,78 @@ class ArgMaxLayer: public Layer { */ template class ConcatLayer: public Layer { - public: - explicit ConcatLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Concat"; - } - virtual inline int MinBottomBlobs() const { - return 2; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_1 @f$ - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_2 @f$ - * -# ... 
- * - K @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_K @f$ - * @param top output Blob vector (length 1) - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * the concatenated output @f$ - * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to concatenated outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top gradient - * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the - * inputs @f$ - * \left[ \begin{array}{cccc} - * \frac{\partial E}{\partial x_1} & - * \frac{\partial E}{\partial x_2} & - * ... 
& - * \frac{\partial E}{\partial x_K} - * \end{array} \right] = - * \frac{\partial E}{\partial y} - * @f$ - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_concats_; - int concat_input_size_; - int concat_axis_; + public: + explicit ConcatLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Concat"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_1 @f$ + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_2 @f$ + * -# ... + * - K @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_K @f$ + * @param top output Blob vector (length 1) + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * the concatenated output @f$ + * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to concatenated outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top gradient + * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the + * inputs @f$ + * \left[ \begin{array}{cccc} + * \frac{\partial E}{\partial x_1} & + * \frac{\partial E}{\partial x_2} & + * ... & + * \frac{\partial E}{\partial x_K} + * \end{array} \right] = + * \frac{\partial E}{\partial y} + * @f$ + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_concats_; + int concat_input_size_; + int concat_axis_; }; /** @@ -168,41 +166,40 @@ class ConcatLayer: public Layer { */ template class EltwiseLayer: public Layer { - public: - explicit EltwiseLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Eltwise"; - } - virtual inline int MinBottomBlobs() const { - return 2; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); 
- - EltwiseParameter_EltwiseOp op_; - vector coeffs_; - Blob max_idx_; - - bool stable_prod_grad_; + public: + explicit EltwiseLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Eltwise"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + EltwiseParameter_EltwiseOp op_; + vector coeffs_; + Blob max_idx_; + + bool stable_prod_grad_; }; /** @@ -213,67 +210,66 @@ class EltwiseLayer: public Layer { */ template class FilterLayer: public Layer { - public: - explicit FilterLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Filter"; - } - virtual inline int MinBottomBlobs() const { - return 2; - } - virtual inline int MinTopBlobs() const { - return 1; - } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_1 @f$ - * -# ... 
- * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_K @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the selector blob - * @param top output Blob vector (length 1+) - * -# @f$ (S \times C \times H \times W) @f$ () - * the filtered output @f$ x_1 @f$ - * where S is the number of items - * that haven't been filtered - * @f$ (S \times C \times H \times W) @f$ - * the filtered output @f$ x_K @f$ - * where S is the number of items - * that haven't been filtered - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the forwarded inputs. - * - * @param top output Blob vector (length 1+), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2+), into which the top error - * gradient is copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool first_reshape_; - vector indices_to_forward_; + public: + explicit FilterLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Filter"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_1 @f$ + * -# ... 
+ * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_K @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the selector blob + * @param top output Blob vector (length 1+) + * -# @f$ (S \times C \times H \times W) @f$ () + * the filtered output @f$ x_1 @f$ + * where S is the number of items + * that haven't been filtered + * @f$ (S \times C \times H \times W) @f$ + * the filtered output @f$ x_K @f$ + * where S is the number of items + * that haven't been filtered + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the forwarded inputs. + * + * @param top output Blob vector (length 1+), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2+), into which the top error + * gradient is copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool first_reshape_; + vector indices_to_forward_; }; /** @@ -288,47 +284,46 @@ class FilterLayer: public Layer { */ template class FlattenLayer: public Layer { - public: - explicit FlattenLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Flatten"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs - * @param top output Blob vector (length 1) - * -# @f$ (N \times CHW \times 1 \times 1) @f$ - * the outputs -- i.e., the (virtually) copied, 
flattened inputs - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top error - * gradient is (virtually) copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit FlattenLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Flatten"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (N \times CHW \times 1 \times 1) @f$ + * the outputs -- i.e., the (virtually) copied, flattened inputs + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -339,41 +334,40 @@ class FlattenLayer: public Layer { */ template class InnerProductLayer: public Layer { - public: - explicit InnerProductLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "InnerProduct"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int M_; - int K_; - int N_; - bool bias_term_; - Blob bias_multiplier_; + public: + explicit InnerProductLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "InnerProduct"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + 
const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; }; /** @@ -383,39 +377,38 @@ class InnerProductLayer: public Layer { */ template class MVNLayer: public Layer { - public: - explicit MVNLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "MVN"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob mean_, variance_, temp_; - - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - Dtype eps_; + public: + explicit MVNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MVN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob mean_, variance_, temp_; + + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + Dtype eps_; }; /* @@ -426,48 +419,47 @@ class 
MVNLayer: public Layer { */ template class ReshapeLayer: public Layer { - public: - explicit ReshapeLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Reshape"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - } - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - - /// @brief vector of axes indices whose dimensions we'll copy from the bottom - vector copy_axes_; - /// @brief the index of the axis whose dimension we infer, or -1 if none - int inferred_axis_; - /// @brief the product of the "constant" output dimensions - int constant_count_; + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reshape"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const 
vector*>& bottom) { + } + + /// @brief vector of axes indices whose dimensions we'll copy from the bottom + vector copy_axes_; + /// @brief the index of the axis whose dimension we infer, or -1 if none + int inferred_axis_; + /// @brief the product of the "constant" output dimensions + int constant_count_; }; /** @@ -479,48 +471,47 @@ class ReshapeLayer: public Layer { */ template class ReductionLayer: public Layer { - public: - explicit ReductionLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Reduction"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief the reduction operation performed by the layer - ReductionParameter_ReductionOp op_; - /// @brief a scalar coefficient applied to all outputs - Dtype coeff_; - /// @brief the index of the first input axis to reduce - int axis_; - /// @brief the number of reductions performed - int num_; - /// @brief the input size of each reduction - int dim_; - /// @brief a helper Blob used for summation (op_ == SUM) - Blob sum_multiplier_; + public: + explicit ReductionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reduction"; + } + virtual 
inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief the reduction operation performed by the layer + ReductionParameter_ReductionOp op_; + /// @brief a scalar coefficient applied to all outputs + Dtype coeff_; + /// @brief the index of the first input axis to reduce + int axis_; + /// @brief the number of reductions performed + int num_; + /// @brief the input size of each reduction + int dim_; + /// @brief a helper Blob used for summation (op_ == SUM) + Blob sum_multiplier_; }; /** @@ -529,37 +520,36 @@ class ReductionLayer: public Layer { */ template class SilenceLayer: public Layer { - public: - explicit SilenceLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - } - - virtual inline const char* type() const { - return "Silence"; - } - virtual inline int MinBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 0; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - } - // We can't define Forward_gpu here, since STUB_GPU will provide - // its own definition for CPU_ONLY mode. 
- virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit SilenceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "Silence"; + } + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + // We can't define Forward_gpu here, since STUB_GPU will provide + // its own definition for CPU_ONLY mode. + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -569,42 +559,41 @@ class SilenceLayer: public Layer { */ template class SoftmaxLayer: public Layer { - public: - explicit SoftmaxLayer(const LayerParameter& param) - : - Layer(param) { - } - ~SoftmaxLayer(); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Softmax"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& 
bottom); - - int outer_num_; - int inner_num_; - int softmax_axis_; - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - /// scale is an intermediate Blob to hold temporary results. - Blob scale_; + public: + explicit SoftmaxLayer(const LayerParameter& param) + : Layer(param) { + } + ~SoftmaxLayer(); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Softmax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int outer_num_; + int inner_num_; + int softmax_axis_; + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + /// scale is an intermediate Blob to hold temporary results. 
+ Blob scale_; }; #ifdef USE_CUDNN @@ -614,25 +603,25 @@ class SoftmaxLayer: public Layer { */ template class CuDNNSoftmaxLayer : public SoftmaxLayer { - public: - explicit CuDNNSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSoftmaxLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNSoftmaxLayer(const LayerParameter& param) + : SoftmaxLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNSoftmaxLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -644,36 +633,35 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { */ template class SplitLayer: public Layer { - public: - explicit SplitLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Split"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int MinTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& 
bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - cl_kernel gpu_add_kernel; + public: + explicit SplitLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Split"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + cl_kernel gpu_add_kernel; }; /** @@ -684,41 +672,40 @@ class SplitLayer: public Layer { */ template class SliceLayer: public Layer { - public: - explicit SliceLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Slice"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int MinTopBlobs() const { - return 2; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int 
num_slices_; - int slice_size_; - int slice_axis_; - vector slice_point_; + public: + explicit SliceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Slice"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 2; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_slices_; + int slice_size_; + int slice_axis_; + vector slice_point_; }; } // namespace caffe diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index e93c4fe8..d4f526b3 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -26,96 +26,94 @@ namespace caffe { */ template class BaseDataLayer: public Layer { - public: - explicit BaseDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden except by the BasePrefetchingDataLayer. - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - } - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - } - - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - - protected: - TransformationParameter transform_param_; - shared_ptr > data_transformer_; - bool output_labels_; + public: + explicit BaseDataLayer(const LayerParameter& param); + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden except by the BasePrefetchingDataLayer. + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + } + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + protected: + TransformationParameter transform_param_; + shared_ptr > data_transformer_; + bool output_labels_; }; template -class BasePrefetchingDataLayer: - public BaseDataLayer, public InternalThread { - public: - explicit BasePrefetchingDataLayer(const LayerParameter& param) - : - BaseDataLayer(param) { - } - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden. 
- void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - virtual void CreatePrefetchThread(); - virtual void JoinPrefetchThread(); - // The thread's function - virtual void InternalThreadEntry() { - } - - protected: - Blob prefetch_data_; - Blob prefetch_label_; - Blob transformed_data_; +class BasePrefetchingDataLayer: public BaseDataLayer, + public InternalThread { + public: + explicit BasePrefetchingDataLayer(const LayerParameter& param) + : BaseDataLayer(param) { + } + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden. + void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + virtual void CreatePrefetchThread(); + virtual void JoinPrefetchThread(); + // The thread's function + virtual void InternalThreadEntry() { + } + + protected: + Blob prefetch_data_; + Blob prefetch_label_; + Blob transformed_data_; }; template class DataLayer: public BasePrefetchingDataLayer { - public: - explicit DataLayer(const LayerParameter& param) - : - BasePrefetchingDataLayer(param) { - } - virtual ~DataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Data"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int MinTopBlobs() const { - return 1; - } - virtual inline int MaxTopBlobs() const { - return 2; - } - - protected: - virtual void InternalThreadEntry(); - - shared_ptr db_; - shared_ptr cursor_; + public: + explicit DataLayer(const LayerParameter& param) + : 
BasePrefetchingDataLayer(param) { + } + virtual ~DataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + virtual void InternalThreadEntry(); + + shared_ptr db_; + shared_ptr cursor_; }; /** @@ -125,42 +123,41 @@ class DataLayer: public BasePrefetchingDataLayer { */ template class DummyDataLayer: public Layer { - public: - explicit DummyDataLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - } - - virtual inline const char* type() const { - return "DummyData"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int MinTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - - vector > > fillers_; - vector refill_; + public: + explicit DummyDataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "DummyData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + vector > > fillers_; + vector refill_; }; /** @@ -170,51 +167,50 @@ class DummyDataLayer: public Layer { */ template class HDF5DataLayer: public Layer { - public: - explicit HDF5DataLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual ~HDF5DataLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - } - - virtual inline const char* type() const { - return "HDF5Data"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int MinTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - } - virtual void LoadHDF5FileData(const char* filename); - - std::vector hdf_filenames_; - unsigned int num_files_; - unsigned int current_file_; - hsize_t current_row_; - std::vector > > hdf_blobs_; - std::vector data_permutation_; - std::vector file_permutation_; + public: + explicit HDF5DataLayer(const LayerParameter& param) + : 
Layer(param) { + } + virtual ~HDF5DataLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void LoadHDF5FileData(const char* filename); + + std::vector hdf_filenames_; + unsigned int num_files_; + unsigned int current_file_; + hsize_t current_row_; + std::vector > > hdf_blobs_; + std::vector data_permutation_; + std::vector file_permutation_; }; /** @@ -224,50 +220,49 @@ class HDF5DataLayer: public Layer { */ template class HDF5OutputLayer: public Layer { - public: - explicit HDF5OutputLayer(const LayerParameter& param) - : - Layer(param), file_opened_(false) { - } - virtual ~HDF5OutputLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - } - - virtual inline const char* type() const { - return "HDF5Output"; - } - // TODO: no limit on the number of blobs - virtual inline int ExactNumBottomBlobs() const { - return 2; - } - virtual inline int ExactNumTopBlobs() const { - return 0; - } - - inline std::string file_name() const { - return file_name_; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void SaveBlobs(); - - bool file_opened_; - std::string file_name_; - hid_t file_id_; - Blob data_blob_; - Blob label_blob_; + public: + explicit HDF5OutputLayer(const LayerParameter& param) + : Layer(param), file_opened_(false) { + } + virtual ~HDF5OutputLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Output"; + } + // TODO: no limit on the number of blobs + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + inline std::string file_name() const { + return file_name_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void SaveBlobs(); + + bool file_opened_; + std::string file_name_; + hid_t file_id_; + Blob data_blob_; + Blob label_blob_; }; /** @@ -277,32 +272,31 @@ class HDF5OutputLayer: public Layer { */ template class ImageDataLayer: public BasePrefetchingDataLayer { - public: - explicit ImageDataLayer(const LayerParameter& param) - : - BasePrefetchingDataLayer(param) { - } - virtual ~ImageDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "ImageData"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int ExactNumTopBlobs() const { - return 2; - } - - protected: - shared_ptr prefetch_rng_; - virtual void ShuffleImages(); - virtual void InternalThreadEntry(); - - vector > lines_; - int lines_id_; + public: + explicit ImageDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~ImageDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ImageData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() 
const { + return 2; + } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleImages(); + virtual void InternalThreadEntry(); + + vector > lines_; + int lines_id_; }; /** @@ -312,58 +306,57 @@ class ImageDataLayer: public BasePrefetchingDataLayer { */ template class MemoryDataLayer: public BaseDataLayer { - public: - explicit MemoryDataLayer(const LayerParameter& param) - : - BaseDataLayer(param), has_new_data_(false) { - } - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "MemoryData"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int ExactNumTopBlobs() const { - return 2; - } - - virtual void AddDatumVector(const vector& datum_vector); - virtual void AddMatVector(const vector& mat_vector, - const vector& labels); - - // Reset should accept const pointers, but can't, because the memory - // will be given to Blob, which is mutable - void Reset(Dtype* data, Dtype* label, int n); - void set_batch_size(int new_size); - - int batch_size() { - return batch_size_; - } - int channels() { - return channels_; - } - int height() { - return height_; - } - int width() { - return width_; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - int batch_size_, channels_, height_, width_, size_; - Dtype* data_; - Dtype* labels_; - int n_; - size_t pos_; - Blob added_data_; - Blob added_label_; - bool has_new_data_; + public: + explicit MemoryDataLayer(const LayerParameter& param) + : BaseDataLayer(param), has_new_data_(false) { + } + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MemoryData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + virtual void AddDatumVector(const vector& datum_vector); + virtual void AddMatVector(const 
vector& mat_vector, + const vector& labels); + + // Reset should accept const pointers, but can't, because the memory + // will be given to Blob, which is mutable + void Reset(Dtype* data, Dtype* label, int n); + void set_batch_size(int new_size); + + int batch_size() { + return batch_size_; + } + int channels() { + return channels_; + } + int height() { + return height_; + } + int width() { + return width_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + int batch_size_, channels_, height_, width_, size_; + Dtype* data_; + Dtype* labels_; + int n_; + size_t pos_; + Blob added_data_; + Blob added_label_; + bool has_new_data_; }; /** @@ -374,42 +367,41 @@ class MemoryDataLayer: public BaseDataLayer { */ template class WindowDataLayer: public BasePrefetchingDataLayer { - public: - explicit WindowDataLayer(const LayerParameter& param) - : - BasePrefetchingDataLayer(param) { - } - virtual ~WindowDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "WindowData"; - } - virtual inline int ExactNumBottomBlobs() const { - return 0; - } - virtual inline int ExactNumTopBlobs() const { - return 2; - } - - protected: - virtual unsigned int PrefetchRand(); - virtual void InternalThreadEntry(); - - shared_ptr prefetch_rng_; - vector > > image_database_; - enum WindowField { - IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM - }; - vector > fg_windows_; - vector > bg_windows_; - Blob data_mean_; - vector mean_values_; - bool has_mean_file_; - bool has_mean_values_; - bool cache_images_; - vector > image_database_cache_; + public: + explicit WindowDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~WindowDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "WindowData"; + } + virtual inline int 
ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + virtual unsigned int PrefetchRand(); + virtual void InternalThreadEntry(); + + shared_ptr prefetch_rng_; + vector > > image_database_; + enum WindowField { + IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM + }; + vector > fg_windows_; + vector > bg_windows_; + Blob data_mean_; + vector mean_values_; + bool has_mean_file_; + bool has_mean_values_; + bool cache_images_; + vector > image_database_cache_; }; } // namespace caffe diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index c283a244..daa4eee0 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -15,134 +15,134 @@ namespace caffe { */ template class DataTransformer { - public: - explicit DataTransformer(const TransformationParameter& param, Phase phase); - virtual ~DataTransformer() { - } - - /** - * @brief Initialize the Random number generations if needed by the - * transformation. - */ - void InitRand(); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to the data. - * - * @param datum - * Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See data_layer.cpp for an example. - */ - void Transform(const Datum& datum, Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. 
- */ - void Transform(const vector & datum_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & mat_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a cv::Mat - * - * @param cv_img - * cv::Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See image_data_layer.cpp for an example. - */ - void Transform(const cv::Mat& cv_img, Blob* transformed_blob); - - /** - * @brief Applies the same transformation defined in the data layer's - * transform_param block to all the num images in a input_blob. - * - * @param input_blob - * A Blob containing the data to be transformed. It applies the same - * transformation to all the num images in the blob. - * @param transformed_blob - * This is destination blob, it will contain as many images as the - * input blob. It can be part of top blob's data. - */ - void Transform(Blob* input_blob, Blob* transformed_blob); - - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param datum - * Datum containing the data to be transformed. - */ - vector InferBlobShape(const Datum& datum); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. 
- */ - vector InferBlobShape(const vector & datum_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - */ - vector InferBlobShape(const vector & mat_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param cv_img - * cv::Mat containing the data to be transformed. - */ - vector InferBlobShape(const cv::Mat& cv_img); - - protected: - /** - * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). - * - * @param n - * The upperbound (exclusive) value of the random number. - * @return - * A uniformly random integer value from ({0, 1, ..., n-1}). - */ - virtual int Rand(int n); - - void Transform(const Datum& datum, Dtype* transformed_data); - // Tranformation parameters - TransformationParameter param_; - - shared_ptr rng_; - Phase phase_; - Blob data_mean_; - vector mean_values_; + public: + explicit DataTransformer(const TransformationParameter& param, Phase phase); + virtual ~DataTransformer() { + } + + /** + * @brief Initialize the Random number generations if needed by the + * transformation. + */ + void InitRand(); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the data. + * + * @param datum + * Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See data_layer.cpp for an example. + */ + void Transform(const Datum& datum, Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Datum. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. 
+ * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & datum_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Mat. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & mat_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a cv::Mat + * + * @param cv_img + * cv::Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See image_data_layer.cpp for an example. + */ + void Transform(const cv::Mat& cv_img, Blob* transformed_blob); + + /** + * @brief Applies the same transformation defined in the data layer's + * transform_param block to all the num images in a input_blob. + * + * @param input_blob + * A Blob containing the data to be transformed. It applies the same + * transformation to all the num images in the blob. + * @param transformed_blob + * This is destination blob, it will contain as many images as the + * input blob. It can be part of top blob's data. + */ + void Transform(Blob* input_blob, Blob* transformed_blob); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param datum + * Datum containing the data to be transformed. + */ + vector InferBlobShape(const Datum& datum); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. 
+ * It uses the first element to infer the shape of the blob. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + */ + vector InferBlobShape(const vector & datum_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + */ + vector InferBlobShape(const vector & mat_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param cv_img + * cv::Mat containing the data to be transformed. + */ + vector InferBlobShape(const cv::Mat& cv_img); + + protected: + /** + * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). + * + * @param n + * The upperbound (exclusive) value of the random number. + * @return + * A uniformly random integer value from ({0, 1, ..., n-1}). 
+ */ + virtual int Rand(int n); + + void Transform(const Datum& datum, Dtype* transformed_data); + // Tranformation parameters + TransformationParameter param_; + + shared_ptr rng_; + Phase phase_; + Blob data_mean_; + vector mean_values_; }; } // namespace caffe diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 2d71b333..1d9fa6fe 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -33,51 +33,50 @@ namespace caffe { class Device { - public: - Device() - : - numPlatforms(0), numDevices(0), device_id(INT_MIN) { - } - ~Device(); - cl_uint numPlatforms; - cl_platform_id * platformIDs; - char platformName[64]; - char openclVersion[64]; - cl_uint numDevices; - cl_device_id * DeviceIDs; + public: + Device() + : numPlatforms(0), numDevices(0), device_id(INT_MIN) { + } + ~Device(); + cl_uint numPlatforms; + cl_platform_id * platformIDs; + char platformName[64]; + char openclVersion[64]; + cl_uint numDevices; + cl_device_id * DeviceIDs; - cl_context Context; - cl_command_queue CommandQueue; - cl_command_queue CommandQueue_helper; - cl_program Program; - cl_device_id * pDevices; - int device_id; + cl_context Context; + cl_command_queue CommandQueue; + cl_command_queue CommandQueue_helper; + cl_program Program; + cl_device_id * pDevices; + int device_id; - clblasOrder col; - clblasOrder row; - std::map Kernels; + clblasOrder col; + clblasOrder row; + std::map Kernels; - cl_int Init(int device_id = -1); - cl_int ConvertToString(std::string pFileName, std::string &Str); - void DisplayPlatformInfo(); - void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + cl_int Init(int device_id = -1); + cl_int ConvertToString(std::string pFileName, std::string &Str); + void DisplayPlatformInfo(); + void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); - void GetDeviceInfo(); - void DeviceQuery(); - int GetDevice() { - return device_id; - } - ; - void BuildProgram(std::string kernel_dir); + void 
GetDeviceInfo(); + void DeviceQuery(); + int GetDevice() { + return device_id; + } + ; + void BuildProgram(std::string kernel_dir); - template - void DisplayDeviceInfo(cl_device_id id, cl_device_info name, - std::string str); - template - void appendBitfield(T info, T value, std::string name, std::string &str); + template + void DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str); + template + void appendBitfield(T info, T value, std::string name, std::string &str); - cl_kernel GetKernel(std::string kernel_name); - void ReleaseKernels(); + cl_kernel GetKernel(std::string kernel_name); + void ReleaseKernels(); }; extern std::string buildOption; extern Device amdDevice; diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index c431dc94..ab9d6b39 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -18,92 +18,88 @@ namespace caffe { /// @brief Fills a Blob with constant or randomly-generated data. template class Filler { - public: - explicit Filler(const FillerParameter& param) - : - filler_param_(param) { - } - virtual ~Filler() { - } - virtual void Fill(Blob* blob) = 0; - protected: - FillerParameter filler_param_; + public: + explicit Filler(const FillerParameter& param) + : filler_param_(param) { + } + virtual ~Filler() { + } + virtual void Fill(Blob* blob) = 0; + protected: + FillerParameter filler_param_; }; // class Filler /// @brief Fills a Blob with constant values @f$ x = 0 @f$. 
template class ConstantFiller: public Filler { - public: - explicit ConstantFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - const int count = blob->count(); - const Dtype value = this->filler_param_.value(); - CHECK(count); - for (int i = 0; i < count; ++i) { - data[i] = value; - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit ConstantFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + const int count = blob->count(); + const Dtype value = this->filler_param_.value(); + CHECK(count); + for (int i = 0; i < count; ++i) { + data[i] = value; + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$. template class UniformFiller: public Filler { - public: - explicit UniformFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit UniformFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), + Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$. 
template class GaussianFiller: public Filler { - public: - explicit GaussianFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - CHECK(blob->count()); - caffe_rng_gaussian(blob->count(), - Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); - int sparse = this->filler_param_.sparse(); - CHECK_GE(sparse, -1); - if (sparse >= 0) { - // Sparse initialization is implemented for "weight" blobs; i.e. matrices. - // These have num == channels == 1; width is number of inputs; height is - // number of outputs. The 'sparse' variable specifies the mean number - // of non-zero input weights for a given output. - CHECK_GE(blob->num_axes(), 1); - const int num_outputs = blob->shape(0); - Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); - rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); - caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); - for (int i = 0; i < blob->count(); ++i) { - data[i] *= mask[i]; - } - } - } + public: + explicit GaussianFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + CHECK(blob->count()); + caffe_rng_gaussian(blob->count(), + Dtype(this->filler_param_.mean()), Dtype(this->filler_param_.std()), + blob->mutable_cpu_data()); + int sparse = this->filler_param_.sparse(); + CHECK_GE(sparse, -1); + if (sparse >= 0) { + // Sparse initialization is implemented for "weight" blobs; i.e. matrices. + // These have num == channels == 1; width is number of inputs; height is + // number of outputs. The 'sparse' variable specifies the mean number + // of non-zero input weights for a given output. 
+ CHECK_GE(blob->num_axes(), 1); + const int num_outputs = blob->shape(0); + Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); + rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); + int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); + for (int i = 0; i < blob->count(); ++i) { + data[i] *= mask[i]; + } + } + } - protected: - shared_ptr rand_vec_; + protected: + shared_ptr rand_vec_; }; /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$ @@ -111,31 +107,30 @@ class GaussianFiller: public Filler { */ template class PositiveUnitballFiller: public Filler { - public: - explicit PositiveUnitballFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); - // We expect the filler to not be called very frequently, so we will - // just use a simple implementation - int dim = blob->count() / blob->num(); - CHECK(dim); - for (int i = 0; i < blob->num(); ++i) { - Dtype sum = 0; - for (int j = 0; j < dim; ++j) { - sum += data[i * dim + j]; - } - for (int j = 0; j < dim; ++j) { - data[i * dim + j] /= sum; - } - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit PositiveUnitballFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + DCHECK(blob->count()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + // We expect the filler to not be called very frequently, so we will + // just use a simple implementation + int dim = blob->count() / blob->num(); + CHECK(dim); + for (int i = 0; i < blob->num(); ++i) { + Dtype sum = 0; + for (int j = 0; j < dim; ++j) { + sum += data[i * dim + j]; + } + for (int j = 0; j < dim; ++j) { + data[i * dim + 
j] /= sum; + } + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -156,29 +151,28 @@ class PositiveUnitballFiller: public Filler { */ template class XavierFiller: public Filler { - public: - explicit XavierFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; - } - Dtype scale = sqrt(Dtype(3) / n); - caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit XavierFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype scale = sqrt(Dtype(3) / n); + caffe_rng_uniform(blob->count(), -scale, scale, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -200,29 +194,28 @@ class XavierFiller: public Filler { */ template class MSRAFiller: public Filler { - public: - explicit MSRAFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); 
- int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; - } - Dtype std = sqrt(Dtype(2) / n); - caffe_rng_gaussian(blob->count(), Dtype(0), std, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit MSRAFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype std = sqrt(Dtype(2) / n); + caffe_rng_gaussian(blob->count(), Dtype(0), std, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /*! @@ -260,25 +253,24 @@ class MSRAFiller: public Filler { */ template class BilinearFiller: public Filler { - public: - explicit BilinearFiller(const FillerParameter& param) - : - Filler(param) { - } - virtual void Fill(Blob* blob) { - CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; - CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; - Dtype* data = blob->mutable_cpu_data(); - int f = ceil(blob->width() / 2.); - float c = (2 * f - 1 - f % 2) / (2. 
* f); - for (int i = 0; i < blob->count(); ++i) { - float x = i % blob->width(); - float y = (i / blob->width()) % blob->height(); - data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); - } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } + public: + explicit BilinearFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; + CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; + Dtype* data = blob->mutable_cpu_data(); + int f = ceil(blob->width() / 2.); + float c = (2 * f - 1 - f % 2) / (2. * f); + for (int i = 0; i < blob->count(); ++i) { + float x = i % blob->width(); + float y = (i / blob->width()) % blob->height(); + data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /** @@ -289,25 +281,25 @@ class BilinearFiller: public Filler { */ template Filler* GetFiller(const FillerParameter& param) { - const std::string& type = param.type(); - if (type == "constant") { - return new ConstantFiller(param); - } else if (type == "gaussian") { - return new GaussianFiller(param); - } else if (type == "positive_unitball") { - return new PositiveUnitballFiller(param); - } else if (type == "uniform") { - return new UniformFiller(param); - } else if (type == "xavier") { - return new XavierFiller(param); - } else if (type == "msra") { - return new MSRAFiller(param); - } else if (type == "bilinear") { - return new BilinearFiller(param); - } else { - CHECK(false) << "Unknown filler name: " << param.type(); - } - return (Filler*) (NULL); + const std::string& type = param.type(); + if (type == "constant") { + return new ConstantFiller(param); + } else if (type == "gaussian") { + return new GaussianFiller(param); + } else if (type == "positive_unitball") { + return new PositiveUnitballFiller(param); + } else if (type == 
"uniform") { + return new UniformFiller(param); + } else if (type == "xavier") { + return new XavierFiller(param); + } else if (type == "msra") { + return new MSRAFiller(param); + } else if (type == "bilinear") { + return new BilinearFiller(param); + } else { + CHECK(false) << "Unknown filler name: " << param.type(); + } + return (Filler*) (NULL); } } // namespace caffe diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 677deea4..dd8ae8bf 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -19,28 +19,27 @@ namespace caffe { * by reimplementing the virutal function InternalThreadEntry. */ class InternalThread { - public: - InternalThread() - : - thread_() { - } - virtual ~InternalThread(); + public: + InternalThread() + : thread_() { + } + virtual ~InternalThread(); - /** Returns true if the thread was successfully started. **/ - bool StartInternalThread(); + /** Returns true if the thread was successfully started. **/ + bool StartInternalThread(); - /** Will not return until the internal thread has exited. */ - bool WaitForInternalThreadToExit(); + /** Will not return until the internal thread has exited. */ + bool WaitForInternalThreadToExit(); - bool is_started() const; + bool is_started() const; - protected: - /* Implement this method in your subclass - with the code you want your thread to run. */ - virtual void InternalThreadEntry() { - } + protected: + /* Implement this method in your subclass + with the code you want your thread to run. */ + virtual void InternalThreadEntry() { + } - shared_ptr thread_; + shared_ptr thread_; }; } // namespace caffe diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 5651e814..c346ede1 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -25,403 +25,400 @@ namespace caffe { */ template class Layer { - public: - /** - * You should not implement your own constructor. 
Any set up code should go - * to SetUp(), where the dimensions of the bottom blobs are provided to the - * layer. - */ - explicit Layer(const LayerParameter& param) - : - layer_param_(param) { - // Set phase and copy blobs (if there are any). - phase_ = param.phase(); - if (layer_param_.blobs_size() > 0) { - blobs_.resize(layer_param_.blobs_size()); - for (int i = 0; i < layer_param_.blobs_size(); ++i) { - blobs_[i].reset(new Blob()); - blobs_[i]->FromProto(layer_param_.blobs(i)); - } - } - } - virtual ~Layer() { - } + public: + /** + * You should not implement your own constructor. Any set up code should go + * to SetUp(), where the dimensions of the bottom blobs are provided to the + * layer. + */ + explicit Layer(const LayerParameter& param) + : layer_param_(param) { + // Set phase and copy blobs (if there are any). + phase_ = param.phase(); + if (layer_param_.blobs_size() > 0) { + blobs_.resize(layer_param_.blobs_size()); + for (int i = 0; i < layer_param_.blobs_size(); ++i) { + blobs_[i].reset(new Blob()); + blobs_[i]->FromProto(layer_param_.blobs(i)); + } + } + } + virtual ~Layer() { + } - /** - * @brief Implements common layer setup functionality. - * - * @param bottom the preshaped input blobs - * @param top - * the allocated but unshaped output blobs, to be shaped by Reshape - * - * Checks that the number of bottom and top blobs is correct. - * Calls LayerSetUp to do special layer setup for individual layer types, - * followed by Reshape to set up sizes of top blobs and internal buffers. - * Sets up the loss weight multiplier blobs for any non-zero loss weights. - * This method may not be overridden. - */ - void SetUp(const vector*>& bottom, - const vector*>& top) { - CheckBlobCounts(bottom, top); - LayerSetUp(bottom, top); - Reshape(bottom, top); - SetLossWeights(top); - } + /** + * @brief Implements common layer setup functionality. 
+ * + * @param bottom the preshaped input blobs + * @param top + * the allocated but unshaped output blobs, to be shaped by Reshape + * + * Checks that the number of bottom and top blobs is correct. + * Calls LayerSetUp to do special layer setup for individual layer types, + * followed by Reshape to set up sizes of top blobs and internal buffers. + * Sets up the loss weight multiplier blobs for any non-zero loss weights. + * This method may not be overridden. + */ + void SetUp(const vector*>& bottom, + const vector*>& top) { + CheckBlobCounts(bottom, top); + LayerSetUp(bottom, top); + Reshape(bottom, top); + SetLossWeights(top); + } - /** - * @brief Does layer-specific setup: your layer should implement this function - * as well as Reshape. - * - * @param bottom - * the preshaped input blobs, whose data fields store the input data for - * this layer - * @param top - * the allocated but unshaped output blobs - * - * This method should do one-time layer specific setup. This includes reading - * and processing relevent parameters from the layer_param_. - * Setting up the shapes of top blobs and internal buffers should be done in - * Reshape, which will be called before the forward pass to - * adjust the top blob sizes. - */ - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - } + /** + * @brief Does layer-specific setup: your layer should implement this function + * as well as Reshape. + * + * @param bottom + * the preshaped input blobs, whose data fields store the input data for + * this layer + * @param top + * the allocated but unshaped output blobs + * + * This method should do one-time layer specific setup. This includes reading + * and processing relevent parameters from the layer_param_. + * Setting up the shapes of top blobs and internal buffers should be done in + * Reshape, which will be called before the forward pass to + * adjust the top blob sizes. 
+ */ + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + } - /** - * @brief Adjust the shapes of top blobs and internal buffers to accomodate - * the shapes of the bottom blobs. - * - * @param bottom the input blobs, with the requested input shapes - * @param top the top blobs, which should be reshaped as needed - * - * This method should reshape top blobs as needed according to the shapes - * of the bottom (input) blobs, as well as reshaping any internal buffers - * and making any other necessary adjustments so that the layer can - * accomodate the bottom blobs. - */ - virtual void Reshape(const vector*>& bottom, - const vector*>& top) = 0; + /** + * @brief Adjust the shapes of top blobs and internal buffers to accomodate + * the shapes of the bottom blobs. + * + * @param bottom the input blobs, with the requested input shapes + * @param top the top blobs, which should be reshaped as needed + * + * This method should reshape top blobs as needed according to the shapes + * of the bottom (input) blobs, as well as reshaping any internal buffers + * and making any other necessary adjustments so that the layer can + * accomodate the bottom blobs. + */ + virtual void Reshape(const vector*>& bottom, + const vector*>& top) = 0; - /** - * @brief Given the bottom blobs, compute the top blobs and the loss. - * - * @param bottom - * the input blobs, whose data fields store the input data for this layer - * @param top - * the preshaped output blobs, whose data fields will store this layers' - * outputs - * \return The total loss from the layer. - * - * The Forward wrapper calls the relevant device wrapper function - * (Forward_cpu or Forward_gpu) to compute the top blob values given the - * bottom blobs. If the layer has any non-zero loss_weights, the wrapper - * then computes and returns the loss. - * - * Your layer should implement Forward_cpu and (optionally) Forward_gpu. 
- */ - inline Dtype Forward(const vector*>& bottom, - const vector*>& top); + /** + * @brief Given the bottom blobs, compute the top blobs and the loss. + * + * @param bottom + * the input blobs, whose data fields store the input data for this layer + * @param top + * the preshaped output blobs, whose data fields will store this layers' + * outputs + * \return The total loss from the layer. + * + * The Forward wrapper calls the relevant device wrapper function + * (Forward_cpu or Forward_gpu) to compute the top blob values given the + * bottom blobs. If the layer has any non-zero loss_weights, the wrapper + * then computes and returns the loss. + * + * Your layer should implement Forward_cpu and (optionally) Forward_gpu. + */ + inline Dtype Forward(const vector*>& bottom, + const vector*>& top); - /** - * @brief Given the top blob error gradients, compute the bottom blob error - * gradients. - * - * @param top - * the output blobs, whose diff fields store the gradient of the error - * with respect to themselves - * @param propagate_down - * a vector with equal length to bottom, with each index indicating - * whether to propagate the error gradients down to the bottom blob at - * the corresponding index - * @param bottom - * the input blobs, whose diff fields will store the gradient of the error - * with respect to themselves after Backward is run - * - * The Backward wrapper calls the relevant device wrapper function - * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the - * top blob diffs. - * - * Your layer should implement Backward_cpu and (optionally) Backward_gpu. - */ - inline void Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom); + /** + * @brief Given the top blob error gradients, compute the bottom blob error + * gradients. 
+ * + * @param top + * the output blobs, whose diff fields store the gradient of the error + * with respect to themselves + * @param propagate_down + * a vector with equal length to bottom, with each index indicating + * whether to propagate the error gradients down to the bottom blob at + * the corresponding index + * @param bottom + * the input blobs, whose diff fields will store the gradient of the error + * with respect to themselves after Backward is run + * + * The Backward wrapper calls the relevant device wrapper function + * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the + * top blob diffs. + * + * Your layer should implement Backward_cpu and (optionally) Backward_gpu. + */ + inline void Backward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); - /** - * @brief Returns the vector of learnable parameter blobs. - */ - vector > >& blobs() { - return blobs_; - } + /** + * @brief Returns the vector of learnable parameter blobs. + */ + vector > >& blobs() { + return blobs_; + } - /** - * @brief Returns the layer parameter. - */ - const LayerParameter& layer_param() const { - return layer_param_; - } + /** + * @brief Returns the layer parameter. + */ + const LayerParameter& layer_param() const { + return layer_param_; + } - /** - * @brief Writes the layer parameter to a protocol buffer - */ - virtual void ToProto(LayerParameter* param, bool write_diff = false); + /** + * @brief Writes the layer parameter to a protocol buffer + */ + virtual void ToProto(LayerParameter* param, bool write_diff = false); - /** - * @brief Returns the scalar loss associated with a top blob at a given index. - */ - inline Dtype loss(const int top_index) const { - return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); - } + /** + * @brief Returns the scalar loss associated with a top blob at a given index. + */ + inline Dtype loss(const int top_index) const { + return (loss_.size() > top_index) ? 
loss_[top_index] : Dtype(0); + } - /** - * @brief Sets the loss associated with a top blob at a given index. - */ - inline void set_loss(const int top_index, const Dtype value) { - if (loss_.size() <= top_index) { - loss_.resize(top_index + 1, Dtype(0)); - } - loss_[top_index] = value; - } + /** + * @brief Sets the loss associated with a top blob at a given index. + */ + inline void set_loss(const int top_index, const Dtype value) { + if (loss_.size() <= top_index) { + loss_.resize(top_index + 1, Dtype(0)); + } + loss_[top_index] = value; + } - /** - * @brief Returns the layer type. - */ - virtual inline const char* type() const { - return ""; - } + /** + * @brief Returns the layer type. + */ + virtual inline const char* type() const { + return ""; + } - /** - * @brief Returns the exact number of bottom blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of bottom blobs. - */ - virtual inline int ExactNumBottomBlobs() const { - return -1; - } - /** - * @brief Returns the minimum number of bottom blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of bottom blobs. - */ - virtual inline int MinBottomBlobs() const { - return -1; - } - /** - * @brief Returns the maximum number of bottom blobs required by the layer, - * or -1 if no maximum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of bottom blobs. - */ - virtual inline int MaxBottomBlobs() const { - return -1; - } - /** - * @brief Returns the exact number of top blobs required by the layer, - * or -1 if no exact number is required. 
- * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of top blobs. - */ - virtual inline int ExactNumTopBlobs() const { - return -1; - } - /** - * @brief Returns the minimum number of top blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of top blobs. - */ - virtual inline int MinTopBlobs() const { - return -1; - } - /** - * @brief Returns the maximum number of top blobs required by the layer, - * or -1 if no maximum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of top blobs. - */ - virtual inline int MaxTopBlobs() const { - return -1; - } - /** - * @brief Returns true if the layer requires an equal number of bottom and - * top blobs. - * - * This method should be overridden to return true if your layer expects an - * equal number of bottom and top blobs. - */ - virtual inline bool EqualNumBottomTopBlobs() const { - return false; - } + /** + * @brief Returns the exact number of bottom blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of bottom blobs. + */ + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of bottom blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of bottom blobs. + */ + virtual inline int MinBottomBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of bottom blobs required by the layer, + * or -1 if no maximum number is required. 
+ * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of bottom blobs. + */ + virtual inline int MaxBottomBlobs() const { + return -1; + } + /** + * @brief Returns the exact number of top blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of top blobs. + */ + virtual inline int ExactNumTopBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of top blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of top blobs. + */ + virtual inline int MinTopBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of top blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of top blobs. + */ + virtual inline int MaxTopBlobs() const { + return -1; + } + /** + * @brief Returns true if the layer requires an equal number of bottom and + * top blobs. + * + * This method should be overridden to return true if your layer expects an + * equal number of bottom and top blobs. + */ + virtual inline bool EqualNumBottomTopBlobs() const { + return false; + } - /** - * @brief Return whether "anonymous" top blobs are created automatically - * by the layer. - * - * If this method returns true, Net::Init will create enough "anonymous" top - * blobs to fulfill the requirement specified by ExactNumTopBlobs() or - * MinTopBlobs(). - */ - virtual inline bool AutoTopBlobs() const { - return false; - } + /** + * @brief Return whether "anonymous" top blobs are created automatically + * by the layer. 
+ * + * If this method returns true, Net::Init will create enough "anonymous" top + * blobs to fulfill the requirement specified by ExactNumTopBlobs() or + * MinTopBlobs(). + */ + virtual inline bool AutoTopBlobs() const { + return false; + } - /** - * @brief Return whether to allow force_backward for a given bottom blob - * index. - * - * If AllowForceBackward(i) == false, we will ignore the force_backward - * setting and backpropagate to blob i only if it needs gradient information - * (as is done when force_backward == false). - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } + /** + * @brief Return whether to allow force_backward for a given bottom blob + * index. + * + * If AllowForceBackward(i) == false, we will ignore the force_backward + * setting and backpropagate to blob i only if it needs gradient information + * (as is done when force_backward == false). + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } - /** - * @brief Specifies whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - * - * You can safely ignore false values and always compute gradients - * for all parameters, but possibly with wasteful computation. - */ - inline bool param_propagate_down(const int param_id) { - return - (param_propagate_down_.size() > param_id) ? - param_propagate_down_[param_id] : false; - } - /** - * @brief Sets whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - */ - inline void set_param_propagate_down(const int param_id, const bool value) { - if (param_propagate_down_.size() <= param_id) { - param_propagate_down_.resize(param_id + 1, true); - } - param_propagate_down_[param_id] = value; - } + /** + * @brief Specifies whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. 
+ * + * You can safely ignore false values and always compute gradients + * for all parameters, but possibly with wasteful computation. + */ + inline bool param_propagate_down(const int param_id) { + return + (param_propagate_down_.size() > param_id) ? + param_propagate_down_[param_id] : false; + } + /** + * @brief Sets whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. + */ + inline void set_param_propagate_down(const int param_id, const bool value) { + if (param_propagate_down_.size() <= param_id) { + param_propagate_down_.resize(param_id + 1, true); + } + param_propagate_down_[param_id] = value; + } - protected: - /** The protobuf that stores the layer parameters */ - LayerParameter layer_param_; - /** The phase: TRAIN or TEST */ - Phase phase_; - /** The vector that stores the learnable parameters as a set of blobs. */ - vector > > blobs_; - /** Vector indicating whether to compute the diff of each param blob. */ - vector param_propagate_down_; + protected: + /** The protobuf that stores the layer parameters */ + LayerParameter layer_param_; + /** The phase: TRAIN or TEST */ + Phase phase_; + /** The vector that stores the learnable parameters as a set of blobs. */ + vector > > blobs_; + /** Vector indicating whether to compute the diff of each param blob. */ + vector param_propagate_down_; - /** The vector that indicates whether each top blob has a non-zero weight in - * the objective function. */ - vector loss_; + /** The vector that indicates whether each top blob has a non-zero weight in + * the objective function. */ + vector loss_; - /** @brief Using the CPU device, compute the layer output. */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) = 0; - /** - * @brief Using the GPU device, compute the layer output. - * Fall back to Forward_cpu() if unavailable. 
- */ - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // LOG(WARNING) << "Using CPU code as backup."; - return Forward_cpu(bottom, top); - } + /** @brief Using the CPU device, compute the layer output. */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) = 0; + /** + * @brief Using the GPU device, compute the layer output. + * Fall back to Forward_cpu() if unavailable. + */ + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // LOG(WARNING) << "Using CPU code as backup."; + return Forward_cpu(bottom, top); + } - /** - * @brief Using the CPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) = 0; - /** - * @brief Using the GPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - * Fall back to Backward_cpu() if unavailable. - */ - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - // LOG(WARNING) << "Using CPU code as backup."; - Backward_cpu(top, propagate_down, bottom); - } + /** + * @brief Using the CPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) = 0; + /** + * @brief Using the GPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + * Fall back to Backward_cpu() if unavailable. 
+ */ + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + // LOG(WARNING) << "Using CPU code as backup."; + Backward_cpu(top, propagate_down, bottom); + } - /** - * Called by the parent Layer's SetUp to check that the number of bottom - * and top Blobs provided as input match the expected numbers specified by - * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. - */ - virtual void CheckBlobCounts(const vector*>& bottom, - const vector*>& top) { - if (ExactNumBottomBlobs() >= 0) { - CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) - << type() << " Layer takes " << ExactNumBottomBlobs() - << " bottom blob(s) as input."; - } - if (MinBottomBlobs() >= 0) { - CHECK_LE(MinBottomBlobs(), bottom.size()) - << type() << " Layer takes at least " << MinBottomBlobs() - << " bottom blob(s) as input."; - } - if (MaxBottomBlobs() >= 0) { - CHECK_GE(MaxBottomBlobs(), bottom.size()) - << type() << " Layer takes at most " << MaxBottomBlobs() - << " bottom blob(s) as input."; - } - if (ExactNumTopBlobs() >= 0) { - CHECK_EQ(ExactNumTopBlobs(), top.size()) - << type() << " Layer produces " << ExactNumTopBlobs() - << " top blob(s) as output."; - } - if (MinTopBlobs() >= 0) { - CHECK_LE(MinTopBlobs(), top.size()) - << type() << " Layer produces at least " << MinTopBlobs() - << " top blob(s) as output."; - } - if (MaxTopBlobs() >= 0) { - CHECK_GE(MaxTopBlobs(), top.size()) - << type() << " Layer produces at most " << MaxTopBlobs() - << " top blob(s) as output."; - } - if (EqualNumBottomTopBlobs()) { - CHECK_EQ(bottom.size(), top.size()) - << type() << " Layer produces one top blob as output for each " - << "bottom blob input."; - } - } + /** + * Called by the parent Layer's SetUp to check that the number of bottom + * and top Blobs provided as input match the expected numbers specified by + * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. 
+ */ + virtual void CheckBlobCounts(const vector*>& bottom, + const vector*>& top) { + if (ExactNumBottomBlobs() >= 0) { + CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) << type() + << " Layer takes " << ExactNumBottomBlobs() + << " bottom blob(s) as input."; + } + if (MinBottomBlobs() >= 0) { + CHECK_LE(MinBottomBlobs(), bottom.size()) << type() + << " Layer takes at least " << MinBottomBlobs() + << " bottom blob(s) as input."; + } + if (MaxBottomBlobs() >= 0) { + CHECK_GE(MaxBottomBlobs(), bottom.size()) << type() + << " Layer takes at most " << MaxBottomBlobs() + << " bottom blob(s) as input."; + } + if (ExactNumTopBlobs() >= 0) { + CHECK_EQ(ExactNumTopBlobs(), top.size()) << type() << " Layer produces " + << ExactNumTopBlobs() << " top blob(s) as output."; + } + if (MinTopBlobs() >= 0) { + CHECK_LE(MinTopBlobs(), top.size()) << type() + << " Layer produces at least " << MinTopBlobs() + << " top blob(s) as output."; + } + if (MaxTopBlobs() >= 0) { + CHECK_GE(MaxTopBlobs(), top.size()) << type() + << " Layer produces at most " << MaxTopBlobs() + << " top blob(s) as output."; + } + if (EqualNumBottomTopBlobs()) { + CHECK_EQ(bottom.size(), top.size()) << type() + << " Layer produces one top blob as output for each " + << "bottom blob input."; + } + } - /** - * Called by SetUp to initialize the weights associated with any top blobs in - * the loss function. Store non-zero loss weights in the diff blob. 
- */ - inline void SetLossWeights(const vector*>& top) { - const int num_loss_weights = layer_param_.loss_weight_size(); - if (num_loss_weights) { - CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " - "unspecified or specified once per top blob."; - for (int top_id = 0; top_id < top.size(); ++top_id) { - const Dtype loss_weight = layer_param_.loss_weight(top_id); - if (loss_weight == Dtype(0)) { - continue; - } - this->set_loss(top_id, loss_weight); - const int count = top[top_id]->count(); - Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); - caffe_set(count, loss_weight, loss_multiplier); - } - } - } + /** + * Called by SetUp to initialize the weights associated with any top blobs in + * the loss function. Store non-zero loss weights in the diff blob. + */ + inline void SetLossWeights(const vector*>& top) { + const int num_loss_weights = layer_param_.loss_weight_size(); + if (num_loss_weights) { + CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " + "unspecified or specified once per top blob."; + for (int top_id = 0; top_id < top.size(); ++top_id) { + const Dtype loss_weight = layer_param_.loss_weight(top_id); + if (loss_weight == Dtype(0)) { + continue; + } + this->set_loss(top_id, loss_weight); + const int count = top[top_id]->count(); + Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); + caffe_set(count, loss_weight, loss_multiplier); + } + } + } - DISABLE_COPY_AND_ASSIGN (Layer); + DISABLE_COPY_AND_ASSIGN (Layer); }; // class Layer @@ -430,69 +427,68 @@ class Layer { // functions. 
template inline Dtype Layer::Forward(const vector*>& bottom, - const vector*>& top) { - Dtype loss = 0; - Reshape(bottom, top); - switch (Caffe::mode()) { - case Caffe::CPU: - Forward_cpu(bottom, top); - for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { - continue; - } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->cpu_data(); - const Dtype* loss_weights = top[top_id]->cpu_diff(); - loss += caffe_cpu_dot(count, data, loss_weights); - } - break; - case Caffe::GPU: - Forward_gpu(bottom, top); + const vector*>& top) { + Dtype loss = 0; + Reshape(bottom, top); + switch (Caffe::mode()) { + case Caffe::CPU: + Forward_cpu(bottom, top); + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { + continue; + } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->cpu_data(); + const Dtype* loss_weights = top[top_id]->cpu_diff(); + loss += caffe_cpu_dot(count, data, loss_weights); + } + break; + case Caffe::GPU: + Forward_gpu(bottom, top); #ifndef CPU_ONLY - for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { - continue; - } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->gpu_data(); - const Dtype* loss_weights = top[top_id]->gpu_diff(); - Dtype blob_loss = 0; - caffe_gpu_dot(count, data, loss_weights, &blob_loss); - loss += blob_loss; - } + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { + continue; + } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->gpu_data(); + const Dtype* loss_weights = top[top_id]->gpu_diff(); + Dtype blob_loss = 0; + caffe_gpu_dot(count, data, loss_weights, &blob_loss); + loss += blob_loss; + } #endif - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } - return loss; + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } + return loss; } template inline void Layer::Backward(const vector*>& top, - 
const vector& propagate_down, - const vector*>& bottom) { - switch (Caffe::mode()) { - case Caffe::CPU: - Backward_cpu(top, propagate_down, bottom); - break; - case Caffe::GPU: - Backward_gpu(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } + const vector& propagate_down, const vector*>& bottom) { + switch (Caffe::mode()) { + case Caffe::CPU: + Backward_cpu(top, propagate_down, bottom); + break; + case Caffe::GPU: + Backward_gpu(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } } // Serialize LayerParameter to protocol buffer template void Layer::ToProto(LayerParameter* param, bool write_diff) { - param->Clear(); - param->CopyFrom(layer_param_); - param->clear_blobs(); - for (int i = 0; i < blobs_.size(); ++i) { - blobs_[i]->ToProto(param->add_blobs(), write_diff); - } + param->Clear(); + param->CopyFrom(layer_param_); + param->clear_blobs(); + for (int i = 0; i < blobs_.size(); ++i) { + blobs_[i]->ToProto(param->add_blobs(), write_diff); + } } } // namespace caffe diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index b64b9eb2..6da8d315 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -52,61 +52,61 @@ class Layer; template class LayerRegistry { - public: - typedef shared_ptr > (*Creator)(const LayerParameter&); - typedef std::map CreatorRegistry; - - static CreatorRegistry& Registry() { - static CreatorRegistry* g_registry_ = new CreatorRegistry(); - return *g_registry_; - } - - // Adds a creator. - static void AddCreator(const string& type, Creator creator) { - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 0) - << "Layer type " << type << " already registered."; - registry[type] = creator; - } - - // Get a layer using a LayerParameter. 
- static shared_ptr > CreateLayer(const LayerParameter& param) { - LOG(INFO) << "Creating layer " << param.name(); - const string& type = param.type(); - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type - << " (known types: " << LayerTypeList() << ")"; - return registry[type](param); - } - - private: - // Layer registry should never be instantiated - everything is done with its - // static variables. - LayerRegistry() { - } - - static string LayerTypeList() { - CreatorRegistry& registry = Registry(); - string layer_types; - for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { - if (iter != registry.begin()) { - layer_types += ", "; - } - layer_types += iter->first; - } - return layer_types; - } + public: + typedef shared_ptr > (*Creator)(const LayerParameter&); + typedef std::map CreatorRegistry; + + static CreatorRegistry& Registry() { + static CreatorRegistry* g_registry_ = new CreatorRegistry(); + return *g_registry_; + } + + // Adds a creator. + static void AddCreator(const string& type, Creator creator) { + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 0) << "Layer type " << type + << " already registered."; + registry[type] = creator; + } + + // Get a layer using a LayerParameter. + static shared_ptr > CreateLayer(const LayerParameter& param) { + LOG(INFO) << "Creating layer " << param.name(); + const string& type = param.type(); + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type + << " (known types: " << LayerTypeList() << ")"; + return registry[type](param); + } + + private: + // Layer registry should never be instantiated - everything is done with its + // static variables. 
+ LayerRegistry() { + } + + static string LayerTypeList() { + CreatorRegistry& registry = Registry(); + string layer_types; + for (typename CreatorRegistry::iterator iter = registry.begin(); + iter != registry.end(); ++iter) { + if (iter != registry.begin()) { + layer_types += ", "; + } + layer_types += iter->first; + } + return layer_types; + } }; template class LayerRegisterer { - public: - LayerRegisterer(const string& type, - shared_ptr > (*creator)(const LayerParameter&)) { - // LOG(INFO) << "Registering layer type: " << type; - LayerRegistry::AddCreator(type, creator); - } + public: + LayerRegisterer(const string& type, + shared_ptr > (*creator)(const LayerParameter&)) { + // LOG(INFO) << "Registering layer type: " << type; + LayerRegistry::AddCreator(type, creator); + } }; #define REGISTER_LAYER_CREATOR(type, creator) \ diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 766645b5..431bd8ea 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -21,81 +21,80 @@ const float kLOG_THRESHOLD = 1e-20; */ template class AccuracyLayer: public Layer { - public: - /** - * @param param provides AccuracyParameter accuracy_param, - * with AccuracyLayer options: - * - top_k (\b optional, default 1). - * Sets the maximum rank @f$ k @f$ at which a prediction is considered - * correct. For example, if @f$ k = 5 @f$, a prediction is counted - * correct if the correct label is among the top 5 predicted labels. 
- */ - explicit AccuracyLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Accuracy"; - } - virtual inline int ExactNumBottomBlobs() const { - return 2; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - /** - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$, a Blob with values in - * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of - * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted - * label @f$ \hat{l}_n @f$ given by its maximal index: - * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels @f$ l @f$, an integer-valued Blob with values - * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ - * indicating the correct class label among the @f$ K @f$ classes - * @param top output Blob vector (length 1) - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * the computed accuracy: @f$ - * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} - * @f$, where @f$ - * \delta\{\mathrm{condition}\} = \left\{ - * \begin{array}{lr} - * 1 & \mbox{if condition} \\ + public: + /** + * @param param provides AccuracyParameter accuracy_param, + * with AccuracyLayer options: + * - top_k (\b optional, default 1). + * Sets the maximum rank @f$ k @f$ at which a prediction is considered + * correct. For example, if @f$ k = 5 @f$, a prediction is counted + * correct if the correct label is among the top 5 predicted labels. 
+ */ + explicit AccuracyLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Accuracy"; + } + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted + * label @f$ \hat{l}_n @f$ given by its maximal index: + * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels @f$ l @f$, an integer-valued Blob with values + * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ + * indicating the correct class label among the @f$ K @f$ classes + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * the computed accuracy: @f$ + * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} + * @f$, where @f$ + * \delta\{\mathrm{condition}\} = \left\{ + * \begin{array}{lr} + * 1 & \mbox{if condition} \\ * 0 & \mbox{otherwise} - * \end{array} \right. - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - for (int i = 0; i < propagate_down.size(); ++i) { - if (propagate_down[i]) { - NOT_IMPLEMENTED; - } - } - } - - int label_axis_, outer_num_, inner_num_; - - int top_k_; - - /// Whether to ignore instances with a certain label. 
- bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < propagate_down.size(); ++i) { + if (propagate_down[i]) { + NOT_IMPLEMENTED; + } + } + } + + int label_axis_, outer_num_, inner_num_; + + int top_k_; + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; }; /** @@ -108,39 +107,38 @@ class AccuracyLayer: public Layer { */ template class LossLayer: public Layer { - public: - explicit LossLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp( - const vector*>& bottom, const vector*>& top); - virtual void Reshape( - const vector*>& bottom, const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { - return 2; - } - - /** - * @brief For convenience and backwards compatibility, instruct the Net to - * automatically allocate a single top Blob for LossLayers, into which - * they output their singleton loss, (even if the user didn't specify - * one in the prototxt, etc.). - */ - virtual inline bool AutoTopBlobs() const { - return true; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - /** - * We usually cannot backpropagate to the labels; ignore force_backward for - * these inputs. 
- */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 1; - } + public: + explicit LossLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + + /** + * @brief For convenience and backwards compatibility, instruct the Net to + * automatically allocate a single top Blob for LossLayers, into which + * they output their singleton loss, (even if the user didn't specify + * one in the prototxt, etc.). + */ + virtual inline bool AutoTopBlobs() const { + return true; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + /** + * We usually cannot backpropagate to the labels; ignore force_backward for + * these inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 1; + } }; /** @@ -169,69 +167,68 @@ class LossLayer: public Layer { */ template class ContrastiveLossLayer: public LossLayer { - public: - explicit ContrastiveLossLayer(const LayerParameter& param) - : - LossLayer(param), diff_() { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { - return 3; - } - virtual inline const char* type() const { - return "ContrastiveLoss"; - } - /** - * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate - * to the first two inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 2; - } - - protected: - /// @copydoc ContrastiveLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Contrastive error gradient w.r.t. the inputs. 
- * - * Computes the gradients with respect to the two input vectors (bottom[0] and - * bottom[1]), but not the similarity label (bottom[2]). - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$a@f$; Backward fills their diff with - * gradients if propagate_down[0] - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$b@f$; Backward fills their diff with gradients if - * propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; // cached for backward pass - Blob dist_sq_; // cached for backward pass - Blob diff_sq_; // tmp storage for gpu forward pass - Blob summer_vec_; // tmp storage for gpu forward pass + public: + explicit ContrastiveLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 3; + } + virtual inline const char* type() const { + return "ContrastiveLoss"; + } + /** + * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate + * to the first two inputs. 
+ */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 2; + } + + protected: + /// @copydoc ContrastiveLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Contrastive error gradient w.r.t. the inputs. + * + * Computes the gradients with respect to the two input vectors (bottom[0] and + * bottom[1]), but not the similarity label (bottom[2]). + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$a@f$; Backward fills their diff with + * gradients if propagate_down[0] + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$b@f$; Backward fills their diff with gradients if + * propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; // cached for backward pass + Blob dist_sq_; // cached for backward pass + Blob diff_sq_; // tmp storage for gpu forward pass + Blob summer_vec_; // tmp storage for gpu forward pass }; /** @@ -262,71 +259,70 @@ class ContrastiveLossLayer: public LossLayer { */ template class EuclideanLossLayer: public LossLayer { - public: - explicit EuclideanLossLayer(const LayerParameter& param) - : - LossLayer(param), diff_() { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "EuclideanLoss"; - } - /** - * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate - * to both inputs -- override to return true and always allow force_backward. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } - - protected: - /// @copydoc EuclideanLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Euclidean error gradient w.r.t. the inputs. - * - * Unlike other children of LossLayer, EuclideanLossLayer \b can compute - * gradients with respect to the label inputs bottom[1] (but still only will - * if propagate_down[1] is set, due to being produced by learnable parameters - * or if force_backward is set). 
In fact, this layer is "commutative" -- the - * result is the same regardless of the order of the two bottoms. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$\hat{y}@f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial \hat{y}} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) - * @f$ if propagate_down[0] - * -# @f$ (N \times C \times H \times W) @f$ - * the targets @f$y@f$; Backward fills their diff with gradients - * @f$ \frac{\partial E}{\partial y} = - * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) - * @f$ if propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; + public: + explicit EuclideanLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "EuclideanLoss"; + } + /** + * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. 
+ */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + /// @copydoc EuclideanLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Euclidean error gradient w.r.t. the inputs. + * + * Unlike other children of LossLayer, EuclideanLossLayer \b can compute + * gradients with respect to the label inputs bottom[1] (but still only will + * if propagate_down[1] is set, due to being produced by learnable parameters + * or if force_backward is set). In fact, this layer is "commutative" -- the + * result is the same regardless of the order of the two bottoms. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$\hat{y}@f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial \hat{y}} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) + * @f$ if propagate_down[0] + * -# @f$ (N \times C \times H \times W) @f$ + * the targets @f$y@f$; Backward fills their diff with gradients + * @f$ \frac{\partial E}{\partial y} = + * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) + * @f$ if propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; }; /** @@ -374,50 +370,49 @@ class EuclideanLossLayer: public LossLayer { */ template class HingeLossLayer: public LossLayer { - public: - explicit HingeLossLayer(const LayerParameter& param) - : - LossLayer(param) { - } - - virtual inline const char* type() const { - return "HingeLoss"; - } - - protected: - /// @copydoc HingeLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the hinge loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. 
- * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$t@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial t} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit HingeLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + + virtual inline const char* type() const { + return "HingeLoss"; + } + + protected: + /// @copydoc HingeLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the hinge loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$t@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial t} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -454,74 +449,73 @@ class HingeLossLayer: public LossLayer { */ template class InfogainLossLayer: public LossLayer { - public: - explicit InfogainLossLayer(const LayerParameter& param) - : - LossLayer(param), infogain_() { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should - // be the infogain matrix. (Otherwise the infogain matrix is loaded from a - // file specified by LayerParameter.) - virtual inline int ExactNumBottomBlobs() const { - return -1; - } - virtual inline int MinBottomBlobs() const { - return 2; - } - virtual inline int MaxBottomBlobs() const { - return 3; - } - - virtual inline const char* type() const { - return "InfogainLoss"; - } - - protected: - /// @copydoc InfogainLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the infogain loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. (The same applies to the infogain matrix, if - * provided as bottom[2] rather than in the layer_param.) 
- * - * @param top output Blob vector (length 1), providing the error gradient - * with respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels (similarly for propagate_down[2] and the - * infogain matrix, if provided as bottom[2]) - * @param bottom input Blob vector (length 2-3) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - * -# @f$ (1 \times 1 \times K \times K) @f$ - * (\b optional) the information gain matrix -- ignored as its error - * gradient computation is not implemented. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob infogain_; + public: + explicit InfogainLossLayer(const LayerParameter& param) + : LossLayer(param), infogain_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should + // be the infogain matrix. (Otherwise the infogain matrix is loaded from a + // file specified by LayerParameter.) 
+ virtual inline int ExactNumBottomBlobs() const { + return -1; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MaxBottomBlobs() const { + return 3; + } + + virtual inline const char* type() const { + return "InfogainLoss"; + } + + protected: + /// @copydoc InfogainLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the infogain loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. (The same applies to the infogain matrix, if + * provided as bottom[2] rather than in the layer_param.) + * + * @param top output Blob vector (length 1), providing the error gradient + * with respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. 
+ * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels (similarly for propagate_down[2] and the + * infogain matrix, if provided as bottom[2]) + * @param bottom input Blob vector (length 2-3) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + * -# @f$ (1 \times 1 \times K \times K) @f$ + * (\b optional) the information gain matrix -- ignored as its error + * gradient computation is not implemented. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob infogain_; }; /** @@ -555,53 +549,52 @@ class InfogainLossLayer: public LossLayer { */ template class MultinomialLogisticLossLayer: public LossLayer { - public: - explicit MultinomialLogisticLossLayer(const LayerParameter& param) - : - LossLayer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "MultinomialLogisticLoss"; - } - - protected: - /// @copydoc MultinomialLogisticLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the multinomial logistic loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. 
- * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit MultinomialLogisticLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MultinomialLogisticLoss"; + } + + protected: + /// @copydoc MultinomialLogisticLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the multinomial logistic loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -635,70 +628,68 @@ class MultinomialLogisticLossLayer: public LossLayer { */ template class SigmoidCrossEntropyLossLayer: public LossLayer { - public: - explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) - : - LossLayer(param), - sigmoid_layer_(new SigmoidLayer(param)), - sigmoid_output_(new Blob()) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "SigmoidCrossEntropyLoss"; - } - - protected: - /// @copydoc SigmoidCrossEntropyLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the - * predictions. 
- * - * Gradients cannot be computed with respect to the target inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as gradient computation with respect - * to the targets is not implemented. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$x@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) - * @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// The internal SigmoidLayer used to map predictions to probabilities. - shared_ptr > sigmoid_layer_; - /// sigmoid_output stores the output of the SigmoidLayer. 
- shared_ptr > sigmoid_output_; - /// bottom vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_bottom_vec_; - /// top vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_top_vec_; + public: + explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) + : LossLayer(param), sigmoid_layer_( + new SigmoidLayer(param)), sigmoid_output_(new Blob()) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SigmoidCrossEntropyLoss"; + } + + protected: + /// @copydoc SigmoidCrossEntropyLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the target inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as gradient computation with respect + * to the targets is not implemented. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$x@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) + * @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// The internal SigmoidLayer used to map predictions to probabilities. + shared_ptr > sigmoid_layer_; + /// sigmoid_output stores the output of the SigmoidLayer. + shared_ptr > sigmoid_output_; + /// bottom vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_bottom_vec_; + /// top vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_top_vec_; }; // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer. @@ -735,100 +726,99 @@ template class SoftmaxLayer; */ template class SoftmaxWithLossLayer: public LossLayer { - public: - /** - * @param param provides LossParameter loss_param, with options: - * - ignore_label (optional) - * Specify a label value that should be ignored when computing the loss. - * - normalize (optional, default true) - * If true, the loss is normalized by the number of (nonignored) labels - * present; otherwise the loss is simply summed over spatial locations. 
- */ - explicit SoftmaxWithLossLayer(const LayerParameter& param) - : - LossLayer(param) { - } - ~SoftmaxWithLossLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "SoftmaxWithLoss"; - } - virtual inline int ExactNumTopBlobs() const { - return -1; - } - virtual inline int MinTopBlobs() const { - return 1; - } - virtual inline int MaxTopBlobs() const { - return 2; - } - - protected: - /// @copydoc SoftmaxWithLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /** - * @brief Computes the softmax loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - void ocl_setup(); - - /// The internal SoftmaxLayer used to map predictions to a distribution. - shared_ptr > softmax_layer_; - /// prob stores the output probability predictions from the SoftmaxLayer. - Blob prob_; - /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_bottom_vec_; - /// top vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_top_vec_; - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; - /// Whether to normalize the loss by the total number of values present - /// (otherwise just by the batch size). - bool normalize_; - - int softmax_axis_, outer_num_, inner_num_; - - protected: - cl_kernel diff_kernel, scal_kernel, softmax_kernel; - cl_mem d_loss; - cl_kernel softmax_loss_fp_kernel; - cl_kernel softmax_loss_bp_kernel; + public: + /** + * @param param provides LossParameter loss_param, with options: + * - ignore_label (optional) + * Specify a label value that should be ignored when computing the loss. + * - normalize (optional, default true) + * If true, the loss is normalized by the number of (nonignored) labels + * present; otherwise the loss is simply summed over spatial locations. 
+ */ + explicit SoftmaxWithLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + ~SoftmaxWithLossLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SoftmaxWithLoss"; + } + virtual inline int ExactNumTopBlobs() const { + return -1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + /// @copydoc SoftmaxWithLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /** + * @brief Computes the softmax loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + void ocl_setup(); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// Whether to normalize the loss by the total number of values present + /// (otherwise just by the batch size). 
+ bool normalize_; + + int softmax_axis_, outer_num_, inner_num_; + + protected: + cl_kernel diff_kernel, scal_kernel, softmax_kernel; + cl_mem d_loss; + cl_kernel softmax_loss_fp_kernel; + cl_kernel softmax_loss_bp_kernel; }; } // namespace caffe diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 2fe273f5..bbd61b88 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -22,264 +22,264 @@ namespace caffe { */ template class Net { - public: - explicit Net(const NetParameter& param); - explicit Net(const string& param_file, Phase phase); - virtual ~Net() { - } + public: + explicit Net(const NetParameter& param); + explicit Net(const string& param_file, Phase phase); + virtual ~Net() { + } - /// @brief Initialize a network with a NetParameter. - void Init(const NetParameter& param); + /// @brief Initialize a network with a NetParameter. + void Init(const NetParameter& param); - /** - * @brief Run Forward with the input Blob%s already fed separately. - * - * You can get the input blobs using input_blobs(). - */ - const vector*>& ForwardPrefilled(Dtype* loss = NULL); + /** + * @brief Run Forward with the input Blob%s already fed separately. + * + * You can get the input blobs using input_blobs(). + */ + const vector*>& ForwardPrefilled(Dtype* loss = NULL); - /** - * The From and To variants of Forward and Backward operate on the - * (topological) ordering by which the net is specified. For general DAG - * networks, note that (1) computing from one layer to another might entail - * extra computation on unrelated branches, and (2) computation starting in - * the middle may be incorrect if all of the layers of a fan-in are not - * included. - */ - Dtype ForwardFromTo(int start, int end); - Dtype ForwardFrom(int start); - Dtype ForwardTo(int end); - /// @brief Run forward using a set of bottom blobs, and return the result. 
- const vector*>& Forward(const vector*> & bottom, - Dtype* loss = NULL); - /** - * @brief Run forward using a serialized BlobProtoVector and return the - * result as a serialized BlobProtoVector - */ - string Forward(const string& input_blob_protos, Dtype* loss = NULL); + /** + * The From and To variants of Forward and Backward operate on the + * (topological) ordering by which the net is specified. For general DAG + * networks, note that (1) computing from one layer to another might entail + * extra computation on unrelated branches, and (2) computation starting in + * the middle may be incorrect if all of the layers of a fan-in are not + * included. + */ + Dtype ForwardFromTo(int start, int end); + Dtype ForwardFrom(int start); + Dtype ForwardTo(int end); + /// @brief Run forward using a set of bottom blobs, and return the result. + const vector*>& Forward(const vector*> & bottom, + Dtype* loss = NULL); + /** + * @brief Run forward using a serialized BlobProtoVector and return the + * result as a serialized BlobProtoVector + */ + string Forward(const string& input_blob_protos, Dtype* loss = NULL); - /** - * The network backward should take no input and output, since it solely - * computes the gradient w.r.t the parameters, and the data has already been - * provided during the forward pass. - */ - void Backward(); - void BackwardFromTo(int start, int end); - void BackwardFrom(int start); - void BackwardTo(int end); + /** + * The network backward should take no input and output, since it solely + * computes the gradient w.r.t the parameters, and the data has already been + * provided during the forward pass. + */ + void Backward(); + void BackwardFromTo(int start, int end); + void BackwardFrom(int start); + void BackwardTo(int end); - /** - * @brief Reshape all layers from bottom to top. - * - * This is useful to propagate changes to layer sizes without running - * a forward pass, e.g. to compute output feature size. 
- */ - void Reshape(); + /** + * @brief Reshape all layers from bottom to top. + * + * This is useful to propagate changes to layer sizes without running + * a forward pass, e.g. to compute output feature size. + */ + void Reshape(); - Dtype ForwardBackward(const vector*> & bottom) { - Dtype loss; - Forward(bottom, &loss); - Backward(); - return loss; - } + Dtype ForwardBackward(const vector*> & bottom) { + Dtype loss; + Forward(bottom, &loss); + Backward(); + return loss; + } - /// @brief Updates the network weights based on the diff values computed. - void Update(); + /// @brief Updates the network weights based on the diff values computed. + void Update(); - /** - * @brief For an already initialized net, implicitly copies (i.e., using no - * additional memory) the pre-trained layers from another Net. - */ - void ShareTrainedLayersWith(const Net* other); - // For an already initialized net, CopyTrainedLayersFrom() copies the already - // trained layers from another net parameter instance. - /** - * @brief For an already initialized net, copies the pre-trained layers from - * another Net. - */ - void CopyTrainedLayersFrom(const NetParameter& param); - void CopyTrainedLayersFrom(const string trained_filename); - /// @brief Writes the net to a proto. - void ToProto(NetParameter* param, bool write_diff = false) const; + /** + * @brief For an already initialized net, implicitly copies (i.e., using no + * additional memory) the pre-trained layers from another Net. + */ + void ShareTrainedLayersWith(const Net* other); + // For an already initialized net, CopyTrainedLayersFrom() copies the already + // trained layers from another net parameter instance. + /** + * @brief For an already initialized net, copies the pre-trained layers from + * another Net. + */ + void CopyTrainedLayersFrom(const NetParameter& param); + void CopyTrainedLayersFrom(const string trained_filename); + /// @brief Writes the net to a proto. 
+ void ToProto(NetParameter* param, bool write_diff = false) const; - /// @brief returns the network name. - inline const string& name() const { - return name_; - } - /// @brief returns the layer names - inline const vector& layer_names() const { - return layer_names_; - } - /// @brief returns the blob names - inline const vector& blob_names() const { - return blob_names_; - } - /// @brief returns the blobs - inline const vector > >& blobs() const { - return blobs_; - } - /// @brief returns the layers - inline const vector > >& layers() const { - return layers_; - } - /// @brief returns the phase: TRAIN or TEST - inline Phase phase() const { - return phase_; - } - /** - * @brief returns the bottom vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. - */ - inline const vector*> >& bottom_vecs() const { - return bottom_vecs_; - } - /** - * @brief returns the top vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. 
- */ - inline const vector*> >& top_vecs() const { - return top_vecs_; - } - inline const vector >& bottom_need_backward() const { - return bottom_need_backward_; - } - inline const vector& blob_loss_weights() const { - return blob_loss_weights_; - } - inline const vector& layer_need_backward() const { - return layer_need_backward_; - } - /// @brief returns the parameters - inline const vector > >& params() const { - return params_; - } - /// @brief returns the parameter learning rate multipliers - inline const vector& params_lr() const { - return params_lr_; - } - inline const vector& params_weight_decay() const { - return params_weight_decay_; - } - const map& param_names_index() const { - return param_names_index_; - } - inline const vector& param_owners() const { - return param_owners_; - } - /// @brief Input and output blob numbers - inline int num_inputs() const { - return net_input_blobs_.size(); - } - inline int num_outputs() const { - return net_output_blobs_.size(); - } - inline const vector*>& input_blobs() const { - return net_input_blobs_; - } - inline const vector*>& output_blobs() const { - return net_output_blobs_; - } - inline const vector& input_blob_indices() const { - return net_input_blob_indices_; - } - inline const vector& output_blob_indices() const { - return net_output_blob_indices_; - } - bool has_blob(const string& blob_name) const; - const shared_ptr > blob_by_name(const string& blob_name) const; - bool has_layer(const string& layer_name) const; - const shared_ptr > layer_by_name( - const string& layer_name) const; + /// @brief returns the network name. 
+ inline const string& name() const { + return name_; + } + /// @brief returns the layer names + inline const vector& layer_names() const { + return layer_names_; + } + /// @brief returns the blob names + inline const vector& blob_names() const { + return blob_names_; + } + /// @brief returns the blobs + inline const vector > >& blobs() const { + return blobs_; + } + /// @brief returns the layers + inline const vector > >& layers() const { + return layers_; + } + /// @brief returns the phase: TRAIN or TEST + inline Phase phase() const { + return phase_; + } + /** + * @brief returns the bottom vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& bottom_vecs() const { + return bottom_vecs_; + } + /** + * @brief returns the top vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& top_vecs() const { + return top_vecs_; + } + inline const vector >& bottom_need_backward() const { + return bottom_need_backward_; + } + inline const vector& blob_loss_weights() const { + return blob_loss_weights_; + } + inline const vector& layer_need_backward() const { + return layer_need_backward_; + } + /// @brief returns the parameters + inline const vector > >& params() const { + return params_; + } + /// @brief returns the parameter learning rate multipliers + inline const vector& params_lr() const { + return params_lr_; + } + inline const vector& params_weight_decay() const { + return params_weight_decay_; + } + const map& param_names_index() const { + return param_names_index_; + } + inline const vector& param_owners() const { + return param_owners_; + } + /// @brief Input and output blob numbers + inline int num_inputs() const { + return net_input_blobs_.size(); + } + inline int num_outputs() const { + return net_output_blobs_.size(); + } + inline const vector*>& input_blobs() const { + return net_input_blobs_; + } + 
inline const vector*>& output_blobs() const { + return net_output_blobs_; + } + inline const vector& input_blob_indices() const { + return net_input_blob_indices_; + } + inline const vector& output_blob_indices() const { + return net_output_blob_indices_; + } + bool has_blob(const string& blob_name) const; + const shared_ptr > blob_by_name(const string& blob_name) const; + bool has_layer(const string& layer_name) const; + const shared_ptr > layer_by_name( + const string& layer_name) const; - void set_debug_info(const bool value) { - debug_info_ = value; - } + void set_debug_info(const bool value) { + debug_info_ = value; + } - // Helpers for Init. - /** - * @brief Remove layers that the user specified should be excluded given the current - * phase, level, and stage. - */ - static void FilterNet(const NetParameter& param, - NetParameter* param_filtered); - /// @brief return whether NetState state meets NetStateRule rule - static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, - const string& layer_name); + // Helpers for Init. + /** + * @brief Remove layers that the user specified should be excluded given the current + * phase, level, and stage. + */ + static void FilterNet(const NetParameter& param, + NetParameter* param_filtered); + /// @brief return whether NetState state meets NetStateRule rule + static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name); - protected: - // Helpers for Init. - /// @brief Append a new input or top blob to the net. - void AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new bottom blob to the net. - int AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new parameter blob to the net. 
- void AppendParam(const NetParameter& param, const int layer_id, - const int param_id); + protected: + // Helpers for Init. + /// @brief Append a new input or top blob to the net. + void AppendTop(const NetParameter& param, const int layer_id, + const int top_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new bottom blob to the net. + int AppendBottom(const NetParameter& param, const int layer_id, + const int bottom_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new parameter blob to the net. + void AppendParam(const NetParameter& param, const int layer_id, + const int param_id); - /// @brief Helper for displaying debug info in Forward about input Blobs. - void InputDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Forward. - void ForwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Backward. - void BackwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Update. - void UpdateDebugInfo(const int param_id); + /// @brief Helper for displaying debug info in Forward about input Blobs. + void InputDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Forward. + void ForwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Backward. + void BackwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Update. + void UpdateDebugInfo(const int param_id); - /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. - void GetLearningRateAndWeightDecay(); + /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. 
+ void GetLearningRateAndWeightDecay(); - /// @brief The network name - string name_; - /// @brief The phase: TRAIN or TEST - Phase phase_; - /// @brief Individual layers in the net - vector > > layers_; - vector layer_names_; - map layer_names_index_; - vector layer_need_backward_; - /// @brief the blobs storing intermediate results between the layer. - vector > > blobs_; - vector blob_names_; - map blob_names_index_; - vector blob_need_backward_; - /// bottom_vecs stores the vectors containing the input for each layer. - /// They don't actually host the blobs (blobs_ does), so we simply store - /// pointers. - vector*> > bottom_vecs_; - vector > bottom_id_vecs_; - vector > bottom_need_backward_; - /// top_vecs stores the vectors containing the output for each layer - vector*> > top_vecs_; - vector > top_id_vecs_; - /// Vector of weight in the loss (or objective) function of each net blob, - /// indexed by blob_id. - vector blob_loss_weights_; - vector > param_id_vecs_; - vector param_owners_; - vector param_display_names_; - vector > param_layer_indices_; - map param_names_index_; - /// blob indices for the input and the output of the net - vector net_input_blob_indices_; - vector net_output_blob_indices_; - vector*> net_input_blobs_; - vector*> net_output_blobs_; - /// The parameters in the network. - vector > > params_; - /// the learning rate multipliers - vector params_lr_; - /// the weight decay multipliers - vector params_weight_decay_; - /// The bytes of memory used by this net - size_t memory_used_; - /// Whether to compute and display debug info for the net. - bool debug_info_; + /// @brief The network name + string name_; + /// @brief The phase: TRAIN or TEST + Phase phase_; + /// @brief Individual layers in the net + vector > > layers_; + vector layer_names_; + map layer_names_index_; + vector layer_need_backward_; + /// @brief the blobs storing intermediate results between the layer. 
+ vector > > blobs_; + vector blob_names_; + map blob_names_index_; + vector blob_need_backward_; + /// bottom_vecs stores the vectors containing the input for each layer. + /// They don't actually host the blobs (blobs_ does), so we simply store + /// pointers. + vector*> > bottom_vecs_; + vector > bottom_id_vecs_; + vector > bottom_need_backward_; + /// top_vecs stores the vectors containing the output for each layer + vector*> > top_vecs_; + vector > top_id_vecs_; + /// Vector of weight in the loss (or objective) function of each net blob, + /// indexed by blob_id. + vector blob_loss_weights_; + vector > param_id_vecs_; + vector param_owners_; + vector param_display_names_; + vector > param_layer_indices_; + map param_names_index_; + /// blob indices for the input and the output of the net + vector net_input_blob_indices_; + vector net_output_blob_indices_; + vector*> net_input_blobs_; + vector*> net_output_blobs_; + /// The parameters in the network. + vector > > params_; + /// the learning rate multipliers + vector params_lr_; + /// the weight decay multipliers + vector params_weight_decay_; + /// The bytes of memory used by this net + size_t memory_used_; + /// Whether to compute and display debug info for the net. 
+ bool debug_info_; - DISABLE_COPY_AND_ASSIGN (Net); + DISABLE_COPY_AND_ASSIGN (Net); }; } // namespace caffe diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 89b6c481..54267f12 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -24,20 +24,19 @@ namespace caffe { */ template class NeuronLayer: public Layer { - public: - explicit NeuronLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } + public: + explicit NeuronLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } }; /** @@ -52,52 +51,51 @@ class NeuronLayer: public Layer { */ template class AbsValLayer: public NeuronLayer { - public: - explicit AbsValLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "AbsVal"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - /// @copydoc AbsValLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the absolute value inputs. 
- * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \mathrm{sign}(x) \frac{\partial E}{\partial y} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit AbsValLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "AbsVal"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /// @copydoc AbsValLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the absolute value inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \mathrm{sign}(x) \frac{\partial E}{\partial y} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -119,43 +117,42 @@ class AbsValLayer: public NeuronLayer { */ template class BNLLLayer: public NeuronLayer { - public: - explicit BNLLLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - - virtual inline const char* type() const { - return "BNLL"; - } - - protected: - /// @copydoc BNLLLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the BNLL inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit BNLLLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "BNLL"; + } + + protected: + /// @copydoc BNLLLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the BNLL inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -171,65 +168,64 @@ class BNLLLayer: public NeuronLayer { */ template class DropoutLayer: public NeuronLayer { - public: - /** - * @param param provides DropoutParameter dropout_param, - * with DropoutLayer options: - * - dropout_ratio (\b optional, default 0.5). - * Sets the probability @f$ p @f$ that any given unit is dropped. 
- */ - explicit DropoutLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Dropout"; - } - virtual ~DropoutLayer(); - void ocl_setup(int bottom_count); - cl_mem MaskMem; - cl_kernel ocl_Kernel_Fwd; - cl_kernel ocl_Kernel_Bwd; - cl_kernel rng_kernel; - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs. At training time, we have @f$ - * y_{\mbox{train}} = \left\{ - * \begin{array}{ll} - * \frac{x}{1 - p} & \mbox{if } u > p \\ + public: + /** + * @param param provides DropoutParameter dropout_param, + * with DropoutLayer options: + * - dropout_ratio (\b optional, default 0.5). + * Sets the probability @f$ p @f$ that any given unit is dropped. + */ + explicit DropoutLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Dropout"; + } + virtual ~DropoutLayer(); + void ocl_setup(int bottom_count); + cl_mem MaskMem; + cl_kernel ocl_Kernel_Fwd; + cl_kernel ocl_Kernel_Bwd; + cl_kernel rng_kernel; + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs. At training time, we have @f$ + * y_{\mbox{train}} = \left\{ + * \begin{array}{ll} + * \frac{x}{1 - p} & \mbox{if } u > p \\ * 0 & \mbox{otherwise} - * \end{array} \right. 
- * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each - * input at each iteration. At test time, we simply have - * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ - Blob rand_vec_; - /// the probability @f$ p @f$ of dropping any input - Dtype threshold_; - /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ - Dtype scale_; - unsigned int uint_thres_; + * \end{array} \right. + * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each + * input at each iteration. At test time, we simply have + * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. 
+ */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ + Blob rand_vec_; + /// the probability @f$ p @f$ of dropping any input + Dtype threshold_; + /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ + Dtype scale_; + unsigned int uint_thres_; }; /** @@ -239,65 +235,64 @@ class DropoutLayer: public NeuronLayer { */ template class ExpLayer: public NeuronLayer { - public: - /** - * @param param provides ExpParameter exp_param, - * with ExpLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit ExpLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Exp"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \gamma ^ {\alpha x + \beta} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. 
- * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype inner_scale_, outer_scale_; + public: + /** + * @param param provides ExpParameter exp_param, + * with ExpLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit ExpLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Exp"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \gamma ^ {\alpha x + \beta} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype inner_scale_, outer_scale_; }; /** @@ -307,67 +302,66 @@ class ExpLayer: public NeuronLayer { */ template class LogLayer: public NeuronLayer { - public: - /** - * @param param provides LogParameter log_param, - * with LogLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit LogLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Log"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = log_{\gamma}(\alpha x + \beta) - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * 
@brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype base_scale_; - Dtype input_scale_, input_shift_; - Dtype backward_num_scale_; + public: + /** + * @param param provides LogParameter log_param, + * with LogLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit LogLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Log"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = log_{\gamma}(\alpha x + \beta) + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief 
Computes the error gradient w.r.t. the exp inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype base_scale_; + Dtype input_scale_, input_shift_; + Dtype backward_num_scale_; }; /** @@ -377,74 +371,73 @@ class LogLayer: public NeuronLayer { */ template class PowerLayer: public NeuronLayer { - public: - /** - * @param param provides PowerParameter power_param, - * with PowerLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - power (\b optional, default 1) the power @f$ \gamma @f$ - */ - explicit PowerLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Power"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (\alpha x + \beta) ^ \gamma - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void 
Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the power inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} - * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = - * \frac{\partial E}{\partial y} - * \frac{\alpha \gamma y}{\alpha x + \beta} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief @f$ \gamma @f$ from layer_param_.power_param() - Dtype power_; - /// @brief @f$ \alpha @f$ from layer_param_.power_param() - Dtype scale_; - /// @brief @f$ \beta @f$ from layer_param_.power_param() - Dtype shift_; - /// @brief Result of @f$ \alpha \gamma @f$ - Dtype diff_scale_; + public: + /** + * @param param provides PowerParameter power_param, + * with PowerLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - power (\b optional, default 1) the power @f$ \gamma @f$ + */ + explicit PowerLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Power"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the 
inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (\alpha x + \beta) ^ \gamma + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the power inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} + * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = + * \frac{\partial E}{\partial y} + * \frac{\alpha \gamma y}{\alpha x + \beta} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief @f$ \gamma @f$ from layer_param_.power_param() + Dtype power_; + /// @brief @f$ \alpha @f$ from layer_param_.power_param() + Dtype scale_; + /// @brief @f$ \beta @f$ from layer_param_.power_param() + Dtype shift_; + /// @brief Result of @f$ \alpha \gamma @f$ + Dtype diff_scale_; }; /** @@ -453,70 +446,69 @@ class PowerLayer: public NeuronLayer { */ template class ReLULayer: public NeuronLayer { - public: - /** - * @param param provides ReLUParameter relu_param, - * with ReLULayer options: - * - negative_slope (\b optional, default 0). - * the value @f$ \nu @f$ by which negative values are multiplied. 
- */ - explicit ReLULayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual inline const char* type() const { - return "ReLU"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \max(0, x) - * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the ReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le 0 \\ + public: + /** + * @param param provides ReLUParameter relu_param, + * with ReLULayer options: + * - negative_slope (\b optional, default 0). + * the value @f$ \nu @f$ by which negative values are multiplied. 
+ */ + explicit ReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual inline const char* type() const { + return "ReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \max(0, x) + * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the ReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$ if propagate_down[0], by default. - * If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed gradients are @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ + * \end{array} \right. + * @f$ if propagate_down[0], by default. 
+ * If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed gradients are @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + * \end{array} \right. + * @f$. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -525,25 +517,25 @@ class ReLULayer: public NeuronLayer { */ template class CuDNNReLULayer : public ReLULayer { - public: - explicit CuDNNReLULayer(const LayerParameter& param) - : ReLULayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNReLULayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNReLULayer(const LayerParameter& param) + : ReLULayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNReLULayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& 
bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -557,53 +549,52 @@ class CuDNNReLULayer : public ReLULayer { */ template class SigmoidLayer: public NeuronLayer { - public: - explicit SigmoidLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - - virtual inline const char* type() const { - return "Sigmoid"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (1 + \exp(-x))^{-1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} y (1 - y) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit SigmoidLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "Sigmoid"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (1 + \exp(-x))^{-1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} y (1 - y) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -612,25 +603,25 @@ class SigmoidLayer: public NeuronLayer { */ template class CuDNNSigmoidLayer : public SigmoidLayer { - public: - explicit CuDNNSigmoidLayer(const LayerParameter& param) - : SigmoidLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSigmoidLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNSigmoidLayer(const LayerParameter& param) + : SigmoidLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNSigmoidLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -644,55 +635,54 @@ class CuDNNSigmoidLayer : public SigmoidLayer { */ template class TanHLayer: public 
NeuronLayer { - public: - explicit TanHLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - - virtual inline const char* type() const { - return "TanH"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} - * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) - * = \frac{\partial E}{\partial y} (1 - y^2) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + explicit TanHLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "TanH"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} + * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) + * = \frac{\partial E}{\partial y} (1 - y^2) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -701,25 +691,25 @@ class TanHLayer: public NeuronLayer { */ template class CuDNNTanHLayer : public TanHLayer { - public: - explicit CuDNNTanHLayer(const LayerParameter& param) - : TanHLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNTanHLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; + public: + explicit CuDNNTanHLayer(const LayerParameter& param) + : TanHLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNTanHLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_; + cudnnTensorDescriptor_t top_desc_; }; #endif @@ -729,51 +719,50 @@ class 
CuDNNTanHLayer : public TanHLayer { */ template class ThresholdLayer: public NeuronLayer { - public: - /** - * @param param provides ThresholdParameter threshold_param, - * with ThresholdLayer options: - * - threshold (\b optional, default 0). - * the threshold value @f$ t @f$ to which the input values are compared. - */ - explicit ThresholdLayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Threshold"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le t \\ + public: + /** + * @param param provides ThresholdParameter threshold_param, + * with ThresholdLayer options: + * - threshold (\b optional, default 0). + * the threshold value @f$ t @f$ to which the input values are compared. + */ + explicit ThresholdLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Threshold"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le t \\ * 1 & \mathrm{if} \; x > t - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - NOT_IMPLEMENTED; - } - - Dtype threshold_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + Dtype threshold_; }; /** @@ -786,83 +775,82 @@ class ThresholdLayer: public NeuronLayer { */ template class PReLULayer: public NeuronLayer { - public: - /** - * @param param provides PReLUParameter prelu_param, - * with PReLULayer options: - * - filler (\b optional, FillerParameter, - * default {'type': constant 'value':0.25}). - * - channel_shared (\b optional, default false). - * negative slopes are shared across channels. - */ - explicit PReLULayer(const LayerParameter& param) - : - NeuronLayer(param) { - } - - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "PReLU"; - } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the computed outputs for each channel @f$i@f$ @f$ - * y_i = \max(0, x_i) + a_i \min(0, x_i) - * @f$. 
- */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the PReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times ...) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their - * diff with gradients @f$ - * \frac{\partial E}{\partial x_i} = \left\{ - * \begin{array}{lr} - * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ + public: + /** + * @param param provides PReLUParameter prelu_param, + * with PReLULayer options: + * - filler (\b optional, FillerParameter, + * default {'type': constant 'value':0.25}). + * - channel_shared (\b optional, default false). + * negative slopes are shared across channels. + */ + explicit PReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "PReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the computed outputs for each channel @f$i@f$ @f$ + * y_i = \max(0, x_i) + a_i \min(0, x_i) + * @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. 
the PReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times ...) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their + * diff with gradients @f$ + * \frac{\partial E}{\partial x_i} = \left\{ + * \begin{array}{lr} + * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - * If param_propagate_down_[0] is true, it fills the diff with gradients - * @f$ - * \frac{\partial E}{\partial a_i} = \left\{ - * \begin{array}{lr} - * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ + * \end{array} \right. + * @f$. + * If param_propagate_down_[0] is true, it fills the diff with gradients + * @f$ + * \frac{\partial E}{\partial a_i} = \left\{ + * \begin{array}{lr} + * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * 0 & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool channel_shared_; - Blob multiplier_; // dot multiplier for backward computation of params - Blob backward_buff_; // temporary buffer for backward computation - Blob bottom_memory_; // memory for in-place computation + * \end{array} \right. + * @f$. 
+ */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool channel_shared_; + Blob multiplier_; // dot multiplier for backward computation of params + Blob backward_buff_; // temporary buffer for backward computation + Blob bottom_memory_; // memory for in-place computation }; } // namespace caffe diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 41e2c21a..16d1f7fc 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -12,59 +12,58 @@ namespace caffe { template class PythonLayer: public Layer { - public: - PythonLayer(PyObject* self, const LayerParameter& param) - : - Layer(param), self_(bp::handle<>(bp::borrowed(self))) { - } + public: + PythonLayer(PyObject* self, const LayerParameter& param) + : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { + } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("setup")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("setup")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("reshape")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("reshape")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - virtual inline const char* type() const { - return "Python"; - } + virtual inline const char* type() const { + return "Python"; + } - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - try { - 
self_.attr("forward")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - try { - self_.attr("backward")(top, propagate_down, bottom); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } - } + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("forward")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + try { + self_.attr("backward")(top, propagate_down, bottom); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } + } - private: - bp::object self_; + private: + bp::object self_; }; } // namespace caffe diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 60dbc5b0..2bddb77f 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -16,62 +16,62 @@ namespace caffe { */ template class Solver { - public: - explicit Solver(const SolverParameter& param); - explicit Solver(const string& param_file); - void Init(const SolverParameter& param); - void InitTrainNet(); - void InitTestNets(); - // The main entry of the solver function. In default, iter will be zero. Pass - // in a non-zero iter number to resume training for a pre-trained net. - virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { - Solve(resume_file.c_str()); - } - void Step(int iters); - // The Restore function implements how one should restore the solver to a - // previously snapshotted state. You should implement the RestoreSolverState() - // function that restores the state from a SolverState protocol buffer. 
- void Restore(const char* resume_file); - virtual ~Solver() { - } - inline shared_ptr > net() { - return net_; - } - inline const vector > >& test_nets() { - return test_nets_; - } - int iter() { - return iter_; - } - - protected: - // Make and apply the update value for the current iteration. - virtual void ApplyUpdate() = 0; - // The Solver::Snapshot function implements the basic snapshotting utility - // that stores the learned net. You should implement the SnapshotSolverState() - // function that produces a SolverState protocol buffer that needs to be - // written to disk together with the learned net. - void Snapshot(); - // The test routine - void TestAll(); - void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(SolverState* state) = 0; - virtual void RestoreSolverState(const SolverState& state) = 0; - - void DisplayOutputBlobs(const int net_id); - - SolverParameter param_; - int iter_; - int current_step_; - shared_ptr > net_; - vector > > test_nets_; - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - - DISABLE_COPY_AND_ASSIGN (Solver); + public: + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); + void Init(const SolverParameter& param); + void InitTrainNet(); + void InitTestNets(); + // The main entry of the solver function. In default, iter will be zero. Pass + // in a non-zero iter number to resume training for a pre-trained net. + virtual void Solve(const char* resume_file = NULL); + inline void Solve(const string resume_file) { + Solve(resume_file.c_str()); + } + void Step(int iters); + // The Restore function implements how one should restore the solver to a + // previously snapshotted state. You should implement the RestoreSolverState() + // function that restores the state from a SolverState protocol buffer. 
+ void Restore(const char* resume_file); + virtual ~Solver() { + } + inline shared_ptr > net() { + return net_; + } + inline const vector > >& test_nets() { + return test_nets_; + } + int iter() { + return iter_; + } + + protected: + // Make and apply the update value for the current iteration. + virtual void ApplyUpdate() = 0; + // The Solver::Snapshot function implements the basic snapshotting utility + // that stores the learned net. You should implement the SnapshotSolverState() + // function that produces a SolverState protocol buffer that needs to be + // written to disk together with the learned net. + void Snapshot(); + // The test routine + void TestAll(); + void Test(const int test_net_id = 0); + virtual void SnapshotSolverState(SolverState* state) = 0; + virtual void RestoreSolverState(const SolverState& state) = 0; + + void DisplayOutputBlobs(const int net_id); + + SolverParameter param_; + int iter_; + int current_step_; + shared_ptr > net_; + vector > > test_nets_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (Solver); }; /** @@ -80,109 +80,103 @@ class Solver { */ template class SGDSolver: public Solver { - public: - explicit SGDSolver(const SolverParameter& param) - : - Solver(param) { - PreSolve(); - } - explicit SGDSolver(const string& param_file) - : - Solver(param_file) { - PreSolve(); - } - - const vector > >& history() { - return history_; - } - - protected: - void PreSolve(); - Dtype GetLearningRate(); - virtual void ApplyUpdate(); - virtual void Normalize(int param_id); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - virtual void ClipGradients(); - virtual void SnapshotSolverState(SolverState * state); - virtual void RestoreSolverState(const SolverState& state); - // history maintains the historical momentum data. - // update maintains update related data and is not needed in snapshots. 
- // temp maintains other information that might be needed in computation - // of gradients/updates and is not needed in snapshots - vector > > history_, update_, temp_; - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - - DISABLE_COPY_AND_ASSIGN (SGDSolver); + public: + explicit SGDSolver(const SolverParameter& param) + : Solver(param) { + PreSolve(); + } + explicit SGDSolver(const string& param_file) + : Solver(param_file) { + PreSolve(); + } + + const vector > >& history() { + return history_; + } + + protected: + void PreSolve(); + Dtype GetLearningRate(); + virtual void ApplyUpdate(); + virtual void Normalize(int param_id); + virtual void Regularize(int param_id); + virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ClipGradients(); + virtual void SnapshotSolverState(SolverState * state); + virtual void RestoreSolverState(const SolverState& state); + // history maintains the historical momentum data. + // update maintains update related data and is not needed in snapshots. 
+ // temp maintains other information that might be needed in computation + // of gradients/updates and is not needed in snapshots + vector > > history_, update_, temp_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (SGDSolver); }; template class NesterovSolver: public SGDSolver { - public: - explicit NesterovSolver(const SolverParameter& param) - : - SGDSolver(param) { - } - explicit NesterovSolver(const string& param_file) - : - SGDSolver(param_file) { - } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - - DISABLE_COPY_AND_ASSIGN (NesterovSolver); + public: + explicit NesterovSolver(const SolverParameter& param) + : SGDSolver(param) { + } + explicit NesterovSolver(const string& param_file) + : SGDSolver(param_file) { + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (NesterovSolver); }; template class AdaGradSolver: public SGDSolver { - public: - explicit AdaGradSolver(const SolverParameter& param) - : - SGDSolver(param) { - constructor_sanity_check(); - } - explicit AdaGradSolver(const string& param_file) - : - SGDSolver(param_file) { - constructor_sanity_check(); - } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; - } - - void ocl_setup(); - protected: - cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; - DISABLE_COPY_AND_ASSIGN (AdaGradSolver); + public: + explicit AdaGradSolver(const SolverParameter& param) + : SGDSolver(param) { + constructor_sanity_check(); + } + explicit AdaGradSolver(const string& param_file) + : SGDSolver(param_file) { + 
constructor_sanity_check(); + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + void constructor_sanity_check() { + CHECK_EQ(0, this->param_.momentum()) + << "Momentum cannot be used with AdaGrad."; + } + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN (AdaGradSolver); }; template Solver* GetSolver(const SolverParameter& param) { - SolverParameter_SolverType type = param.solver_type(); - - switch (type) { - case SolverParameter_SolverType_SGD: - return new SGDSolver(param); - case SolverParameter_SolverType_NESTEROV: - return new NesterovSolver(param); - case SolverParameter_SolverType_ADAGRAD: - return new AdaGradSolver(param); - default: - LOG(FATAL) << "Unknown SolverType: " << type; - } - return (Solver*) NULL; + SolverParameter_SolverType type = param.solver_type(); + + switch (type) { + case SolverParameter_SolverType_SGD: + return new SGDSolver(param); + case SolverParameter_SolverType_NESTEROV: + return new NesterovSolver(param); + case SolverParameter_SolverType_ADAGRAD: + return new AdaGradSolver(param); + default: + LOG(FATAL) << "Unknown SolverType: " << type; + } + return (Solver*) NULL; } } // namespace caffe diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 1a16c04a..1647b6f3 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -49,12 +49,12 @@ namespace caffe { // does not seem to create a memory bottleneck here. inline void CaffeMallocHost(void** ptr, size_t size) { - *ptr = malloc(size); - CHECK(*ptr) << "host allocation of size " << size << " failed"; + *ptr = malloc(size); + CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr) { - free(ptr); + free(ptr); } /** @@ -64,55 +64,53 @@ inline void CaffeFreeHost(void* ptr) { * TODO(dox): more thorough description. 
*/ class SyncedMemory { - public: - SyncedMemory() - : - cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { - ocl_setup(); - } - explicit SyncedMemory(size_t size) - : - cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), data_layer_(false) { - ocl_setup(); - } + public: + SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { + ocl_setup(); + } + explicit SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { + ocl_setup(); + } - ~SyncedMemory(); - const void* cpu_data(); - void set_cpu_data(void* data); - const void* gpu_data(); - const void* gpu_cache_data(); - void* mutable_cpu_data(); - void* mutable_gpu_data(); - enum SyncedHead { - UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED - }; - SyncedHead head() { - return head_; - } - size_t size() { - return size_; - } - void set_data_layer() { - data_layer_ = true; - } - private: - void ocl_setup(); - protected: - cl_kernel oclmem_kernel; + ~SyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + const void* gpu_cache_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + enum SyncedHead { + UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED + }; + SyncedHead head() { + return head_; + } + size_t size() { + return size_; + } + void set_data_layer() { + data_layer_ = true; + } + private: + void ocl_setup(); + protected: + cl_kernel oclmem_kernel; - private: - void to_cpu(); - void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - void* gpu_cache_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; - bool data_layer_; - DISABLE_COPY_AND_ASSIGN (SyncedMemory); + private: + void to_cpu(); + void to_gpu(); + void* cpu_ptr_; + void* gpu_ptr_; + void* gpu_cache_ptr_; + size_t size_; + SyncedHead head_; + 
bool own_cpu_data_; + bool data_layer_; + DISABLE_COPY_AND_ASSIGN (SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index 179e31ca..401e2136 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -27,27 +27,27 @@ int main(int argc, char** argv); namespace caffe { -template +template class MultiDeviceTest: public ::testing::Test { - public: - typedef typename TypeParam::Dtype Dtype; - protected: - MultiDeviceTest() { - Caffe::set_mode(TypeParam::device); - } - virtual ~MultiDeviceTest() { - } + public: + typedef typename TypeParam::Dtype Dtype; + protected: + MultiDeviceTest() { + Caffe::set_mode(TypeParam::device); + } + virtual ~MultiDeviceTest() { + } }; typedef ::testing::Types TestDtypes; -template +template struct CPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::CPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::CPU; }; -template +template class CPUDeviceTest: public MultiDeviceTest > { }; @@ -58,19 +58,18 @@ CPUDevice > TestDtypesAndDevices; #else -template +template struct GPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::GPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::GPU; }; -template +template class GPUDeviceTest: public MultiDeviceTest > { }; -typedef ::testing::Types, CPUDevice, - GPUDevice, GPUDevice > -TestDtypesAndDevices; +typedef ::testing::Types, CPUDevice, GPUDevice, + GPUDevice > TestDtypesAndDevices; #endif diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index 07fe69cf..081ce203 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -15,244 +15,244 @@ namespace caffe { // The gradient checker adds a L2 normalization loss function on top of the // top blobs, and checks the 
gradient. -template +template class GradientChecker { - public: - // kink and kink_range specify an ignored nonsmooth region of the form - // kink - kink_range <= |feature value| <= kink + kink_range, - // which accounts for all nonsmoothness in use by caffe - GradientChecker(const Dtype stepsize, const Dtype threshold, - const unsigned int seed = 1701, const Dtype kink = 0., - const Dtype kink_range = -1) - : stepsize_(stepsize), threshold_(threshold), seed_(seed), - kink_(kink), kink_range_(kink_range) { - } - // Checks the gradient of a layer, with provided bottom layers and top - // layers. - // Note that after the gradient check, we do not guarantee that the data - // stored in the layer parameters and the blobs are unchanged. - void CheckGradient(Layer* layer, const vector*>& bottom, - const vector*>& top, int check_bottom = -1) { - layer->SetUp(bottom, top); - CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); - } - void CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom = -1); + public: + // kink and kink_range specify an ignored nonsmooth region of the form + // kink - kink_range <= |feature value| <= kink + kink_range, + // which accounts for all nonsmoothness in use by caffe + GradientChecker(const Dtype stepsize, const Dtype threshold, + const unsigned int seed = 1701, const Dtype kink = 0., + const Dtype kink_range = -1) + : stepsize_(stepsize), threshold_(threshold), seed_(seed), kink_(kink), kink_range_( + kink_range) { + } + // Checks the gradient of a layer, with provided bottom layers and top + // layers. + // Note that after the gradient check, we do not guarantee that the data + // stored in the layer parameters and the blobs are unchanged. 
+ void CheckGradient(Layer* layer, const vector*>& bottom, + const vector*>& top, int check_bottom = -1) { + layer->SetUp(bottom, top); + CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); + } + void CheckGradientExhaustive(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom = -1); - // CheckGradientEltwise can be used to test layers that perform element-wise - // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when - // i != j. - void CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top); + // CheckGradientEltwise can be used to test layers that perform element-wise + // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when + // i != j. + void CheckGradientEltwise(Layer* layer, + const vector*>& bottom, const vector*>& top); - void CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise = false); + void CheckGradientSingle(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, + bool element_wise = false); - // Checks the gradient of a network. This network should not have any data - // layers or loss layers, since the function does not explicitly deal with - // such cases yet. All input blobs and parameter blobs are going to be - // checked, layer-by-layer to avoid numerical problems to accumulate. - void CheckGradientNet(const Net& net, - const vector*>& input); + // Checks the gradient of a network. This network should not have any data + // layers or loss layers, since the function does not explicitly deal with + // such cases yet. All input blobs and parameter blobs are going to be + // checked, layer-by-layer to avoid numerical problems to accumulate. 
+ void CheckGradientNet(const Net& net, + const vector*>& input); - protected: - Dtype GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id = -1, int top_data_id = -1); - Dtype stepsize_; - Dtype threshold_; - unsigned int seed_; - Dtype kink_; - Dtype kink_range_; + protected: + Dtype GetObjAndGradient(const Layer& layer, + const vector*>& top, int top_id = -1, int top_data_id = -1); + Dtype stepsize_; + Dtype threshold_; + unsigned int seed_; + Dtype kink_; + Dtype kink_range_; }; -template +template void GradientChecker::CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise) { - if (element_wise) { - CHECK_EQ(0, layer->blobs().size()); - CHECK_LE(0, top_id); - CHECK_LE(0, top_data_id); - const int top_count = top[top_id]->count(); - for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) { - CHECK_EQ(top_count, bottom[blob_id]->count()); - } - } - // First, figure out what blobs we need to check against, and zero init - // parameter blobs. - vector*> blobs_to_check; - vector propagate_down(bottom.size(), check_bottom < 0); - for (int i = 0; i < layer->blobs().size(); ++i) { - Blob* blob = layer->blobs()[i].get(); - caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); - blobs_to_check.push_back(blob); - } - if (check_bottom < 0) { - for (int i = 0; i < bottom.size(); ++i) { - blobs_to_check.push_back(bottom[i]); - } - } else { - CHECK_LT(check_bottom, bottom.size()); - blobs_to_check.push_back(bottom[check_bottom]); - propagate_down[check_bottom] = true; - } - // Compute the gradient analytically using Backward - Caffe::set_random_seed(seed_); - // Ignore the loss from the layer (it's just the weighted sum of the losses - // from the top blobs, whose gradients we may want to test individually). 
- layer->Forward(bottom, top); - // Get additional loss from the objective - GetObjAndGradient(*layer, top, top_id, top_data_id); - layer->Backward(top, propagate_down, bottom); - // Store computed gradients for all checked blobs - vector < shared_ptr > > - computed_gradient_blobs(blobs_to_check.size()); - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - computed_gradient_blobs[blob_id].reset(new Blob()); - computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob); - const int count = blobs_to_check[blob_id]->count(); - const Dtype* diff = blobs_to_check[blob_id]->cpu_diff(); - Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->mutable_cpu_data(); - caffe_copy(count, diff, computed_gradients); - } - // Compute derivative of top w.r.t. each bottom and parameter input using - // finite differencing. - // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - const Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->cpu_data(); - // LOG(ERROR) << "Blob " << blob_id << ": checking " - // << current_blob->count() << " parameters."; - for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { - // For an element-wise layer, we only need to do finite differencing to - // compute the derivative of top[top_id][top_data_id] w.r.t. - // bottom[blob_id][i] only for i == top_data_id. For any other - // i != top_data_id, we know the derivative is 0 by definition, and simply - // check that that's true. - Dtype estimated_gradient = 0; - Dtype positive_objective = 0; - Dtype negative_objective = 0; - if (!element_wise || (feat_id == top_data_id)) { - // Do finite differencing. - // Compute loss with stepsize_ added to input. 
- current_blob->mutable_cpu_data()[feat_id] += stepsize_; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - positive_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Compute loss with stepsize_ subtracted from input. - current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - negative_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Recover original input value. - current_blob->mutable_cpu_data()[feat_id] += stepsize_; - estimated_gradient = (positive_objective - negative_objective) / - stepsize_ / 2.; - } - Dtype computed_gradient = computed_gradients[feat_id]; - Dtype feature = current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " " - // << current_blob->cpu_diff()[feat_id]; - if (kink_ - kink_range_ > fabs(feature) - || fabs(feature) > kink_ + kink_range_) { - // We check relative accuracy, but for too small values, we threshold - // the scale factor by 1. 
- Dtype scale = std::max( - std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); - EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) - << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blob_id << "," << feat_id - << "; feat = " << feature - << "; objective+ = " << positive_objective - << "; objective- = " << negative_objective; - } - // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "computed gradient: " << computed_gradient - // << " estimated_gradient: " << estimated_gradient; - } - } + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, bool element_wise) { + if (element_wise) { + CHECK_EQ(0, layer->blobs().size()); + CHECK_LE(0, top_id); + CHECK_LE(0, top_data_id); + const int top_count = top[top_id]->count(); + for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) { + CHECK_EQ(top_count, bottom[blob_id]->count()); + } + } + // First, figure out what blobs we need to check against, and zero init + // parameter blobs. + vector*> blobs_to_check; + vector propagate_down(bottom.size(), check_bottom < 0); + for (int i = 0; i < layer->blobs().size(); ++i) { + Blob* blob = layer->blobs()[i].get(); + caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); + blobs_to_check.push_back(blob); + } + if (check_bottom < 0) { + for (int i = 0; i < bottom.size(); ++i) { + blobs_to_check.push_back(bottom[i]); + } + } else { + CHECK_LT(check_bottom, bottom.size()); + blobs_to_check.push_back(bottom[check_bottom]); + propagate_down[check_bottom] = true; + } + // Compute the gradient analytically using Backward + Caffe::set_random_seed(seed_); + // Ignore the loss from the layer (it's just the weighted sum of the losses + // from the top blobs, whose gradients we may want to test individually). 
+ layer->Forward(bottom, top); + // Get additional loss from the objective + GetObjAndGradient(*layer, top, top_id, top_data_id); + layer->Backward(top, propagate_down, bottom); + // Store computed gradients for all checked blobs + vector < shared_ptr > + > computed_gradient_blobs(blobs_to_check.size()); + for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { + Blob* current_blob = blobs_to_check[blob_id]; + computed_gradient_blobs[blob_id].reset(new Blob()); + computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob); + const int count = blobs_to_check[blob_id]->count(); + const Dtype* diff = blobs_to_check[blob_id]->cpu_diff(); + Dtype* computed_gradients = + computed_gradient_blobs[blob_id]->mutable_cpu_data(); + caffe_copy(count, diff, computed_gradients); + } + // Compute derivative of top w.r.t. each bottom and parameter input using + // finite differencing. + // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; + for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { + Blob* current_blob = blobs_to_check[blob_id]; + const Dtype* computed_gradients = + computed_gradient_blobs[blob_id]->cpu_data(); + // LOG(ERROR) << "Blob " << blob_id << ": checking " + // << current_blob->count() << " parameters."; + for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { + // For an element-wise layer, we only need to do finite differencing to + // compute the derivative of top[top_id][top_data_id] w.r.t. + // bottom[blob_id][i] only for i == top_data_id. For any other + // i != top_data_id, we know the derivative is 0 by definition, and simply + // check that that's true. + Dtype estimated_gradient = 0; + Dtype positive_objective = 0; + Dtype negative_objective = 0; + if (!element_wise || (feat_id == top_data_id)) { + // Do finite differencing. + // Compute loss with stepsize_ added to input. 
+ current_blob->mutable_cpu_data()[feat_id] += stepsize_; + Caffe::set_random_seed(seed_); + layer->Forward(bottom, top); + positive_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); + // Compute loss with stepsize_ subtracted from input. + current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; + Caffe::set_random_seed(seed_); + layer->Forward(bottom, top); + negative_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); + // Recover original input value. + current_blob->mutable_cpu_data()[feat_id] += stepsize_; + estimated_gradient = (positive_objective - negative_objective) + / stepsize_ / 2.; + } + Dtype computed_gradient = computed_gradients[feat_id]; + Dtype feature = current_blob->cpu_data()[feat_id]; + // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " " + // << current_blob->cpu_diff()[feat_id]; + if (kink_ - kink_range_ > fabs(feature) + || fabs(feature) > kink_ + kink_range_) { + // We check relative accuracy, but for too small values, we threshold + // the scale factor by 1. 
+ Dtype scale = std::max( + std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); + EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) + << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id + << "," << top_data_id << "," << blob_id << "," << feat_id + << "; feat = " << feature << "; objective+ = " << positive_objective + << "; objective- = " << negative_objective; + } + // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; + // LOG(ERROR) << "computed gradient: " << computed_gradient + // << " estimated_gradient: " << estimated_gradient; + } + } } -template +template void GradientChecker::CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob."; - // LOG(ERROR) << "Exhaustive Mode."; - for (int i = 0; i < top.size(); ++i) { - // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count(); - for (int j = 0; j < top[i]->count(); ++j) { - // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j; - CheckGradientSingle(layer, bottom, top, check_bottom, i, j); - } - } + const vector*>& bottom, const vector*>& top, + int check_bottom) { + layer->SetUp(bottom, top); + CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob."; + // LOG(ERROR) << "Exhaustive Mode."; + for (int i = 0; i < top.size(); ++i) { + // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count(); + for (int j = 0; j < top[i]->count(); ++j) { + // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j; + CheckGradientSingle(layer, bottom, top, check_bottom, i, j); + } + } } -template +template void GradientChecker::CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; - const int check_bottom = -1; - const bool 
element_wise = true; - for (int i = 0; i < top.size(); ++i) { - for (int j = 0; j < top[i]->count(); ++j) { - CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise); - } - } + const vector*>& bottom, const vector*>& top) { + layer->SetUp(bottom, top); + CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; + const int check_bottom = -1; + const bool element_wise = true; + for (int i = 0; i < top.size(); ++i) { + for (int j = 0; j < top[i]->count(); ++j) { + CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise); + } + } } -template -void GradientChecker::CheckGradientNet( - const Net& net, const vector*>& input) { - const vector > >& layers = net.layers(); - vector < vector*> > &bottom_vecs = net.bottom_vecs(); - vector < vector*> > &top_vecs = net.top_vecs(); - for (int i = 0; i < layers.size(); ++i) { - net.Forward(input); - LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); - CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]); - } +template +void GradientChecker::CheckGradientNet(const Net& net, + const vector*>& input) { + const vector > >& layers = net.layers(); + vector < vector*> > &bottom_vecs = net.bottom_vecs(); + vector < vector*> > &top_vecs = net.top_vecs(); + for (int i = 0; i < layers.size(); ++i) { + net.Forward(input); + LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); + CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]); + } } -template +template Dtype GradientChecker::GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id, int top_data_id) { - Dtype loss = 0; - if (top_id < 0) { - // the loss will be half of the sum of squares of all outputs - for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - const Dtype* top_blob_data = top_blob->cpu_data(); - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - int count = top_blob->count(); - for (int j = 0; j < count; ++j) 
{ - loss += top_blob_data[j] * top_blob_data[j]; - } - // set the diff: simply the data. - caffe_copy(top_blob->count(), top_blob_data, top_blob_diff); - } - loss /= 2.; - } else { - // the loss will be the top_data_id-th element in the top_id-th blob. - for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - caffe_set(top_blob->count(), Dtype(0), top_blob_diff); - } - const Dtype loss_weight = 2; - loss = top[top_id]->cpu_data()[top_data_id] * loss_weight; - top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight; - } - return loss; + const vector*>& top, int top_id, int top_data_id) { + Dtype loss = 0; + if (top_id < 0) { + // the loss will be half of the sum of squares of all outputs + for (int i = 0; i < top.size(); ++i) { + Blob* top_blob = top[i]; + const Dtype* top_blob_data = top_blob->cpu_data(); + Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); + int count = top_blob->count(); + for (int j = 0; j < count; ++j) { + loss += top_blob_data[j] * top_blob_data[j]; + } + // set the diff: simply the data. + caffe_copy(top_blob->count(), top_blob_data, top_blob_diff); + } + loss /= 2.; + } else { + // the loss will be the top_data_id-th element in the top_id-th blob. 
+ for (int i = 0; i < top.size(); ++i) { + Blob* top_blob = top[i]; + Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); + caffe_set(top_blob->count(), Dtype(0), top_blob_diff); + } + const Dtype loss_weight = 2; + loss = top[top_id]->cpu_data()[top_data_id] * loss_weight; + top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight; + } + return loss; } } // namespace caffe diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index f5818f6f..f48be453 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -8,50 +8,50 @@ namespace caffe { class Timer { - public: - Timer(); - virtual ~Timer(); - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); - virtual float Seconds(); - - inline bool initted() { - return initted_; - } - inline bool running() { - return running_; - } - inline bool has_run_at_least_once() { - return has_run_at_least_once_; - } - - protected: - void Init(); - - bool initted_; - bool running_; - bool has_run_at_least_once_; - #ifndef CPU_ONLY - //cudaEvent_t start_gpu_; - //cudaEvent_t stop_gpu_; + public: + Timer(); + virtual ~Timer(); + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); + virtual float Seconds(); + + inline bool initted() { + return initted_; + } + inline bool running() { + return running_; + } + inline bool has_run_at_least_once() { + return has_run_at_least_once_; + } + + protected: + void Init(); + + bool initted_; + bool running_; + bool has_run_at_least_once_; +#ifndef CPU_ONLY + //cudaEvent_t start_gpu_; + //cudaEvent_t stop_gpu_; #endif - boost::posix_time::ptime start_cpu_; - boost::posix_time::ptime stop_cpu_; - float elapsed_milliseconds_; - float elapsed_microseconds_; + boost::posix_time::ptime start_cpu_; + boost::posix_time::ptime stop_cpu_; + float elapsed_milliseconds_; + float elapsed_microseconds_; }; class CPUTimer: public Timer { - 
public: - explicit CPUTimer(); - virtual ~CPUTimer() { - } - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); + public: + explicit CPUTimer(); + virtual ~CPUTimer() { + } + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); }; } // namespace caffe diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index 1ff29356..1994c48a 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -15,116 +15,116 @@ } while (0) inline const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; - } - return "Unknown cudnn status"; + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return 
"CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + } + return "Unknown cudnn status"; } namespace caffe { - namespace cudnn { - - template class dataType; - template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_FLOAT; - static float oneval, zeroval; - static const void *one, *zero; - }; - template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; - static double oneval, zeroval; - static const void *one, *zero; - }; - - template - inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); - } - - template - inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); - } - - template - inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { - const int stride_w = 1; - const int stride_h = w * stride_w; - const int stride_c = h * stride_h; - const int stride_n = c * stride_c; - setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); - } - - template - inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); - } - - template - inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); - } - - template - inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, - 
cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, int pad_w, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); - } - - template - inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { - switch (poolmethod) { - case PoolingParameter_PoolMethod_MAX: - *mode = CUDNN_POOLING_MAX; - break; - case PoolingParameter_PoolMethod_AVE: - *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); - } - - } // namespace cudnn + namespace cudnn { + + template class dataType; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + static float oneval, zeroval; + static const void *one, *zero; + }; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + static double oneval, zeroval; + static const void *one, *zero; + }; + + template + inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w, + int stride_n, int stride_c, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, + n, c, h, w, stride_n, stride_c, stride_h, stride_w)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w) { + const int stride_w = 1; + const int stride_h = w * stride_w; + const int stride_c = h * stride_h; + const int stride_n = c * stride_c; + setTensor4dDesc(desc, n, c, h, 
w, + stride_n, stride_c, stride_h, stride_w); + } + + template + inline void createFilterDesc(cudnnFilterDescriptor_t* desc, + int n, int c, int h, int w) { + CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, + n, c, h, w)); + } + + template + inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); + } + + template + inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, + cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, + int pad_h, int pad_w, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); + } + + template + inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, + PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, + int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { + switch (poolmethod) { + case PoolingParameter_PoolMethod_MAX: + *mode = CUDNN_POOLING_MAX; + break; + case PoolingParameter_PoolMethod_AVE: + *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); + CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, + pad_h, pad_w, stride_h, stride_w)); + } + + } // namespace cudnn } // namespace caffe diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp index a65e3acf..a872fb07 100644 --- a/include/caffe/util/db.hpp +++ b/include/caffe/util/db.hpp @@ -10,48 +10,48 @@ namespace caffe { namespace db { enum Mode { - READ, WRITE, NEW + READ, WRITE, NEW }; class Cursor { - public: - Cursor() { - } - virtual ~Cursor() { - } - virtual void SeekToFirst() = 0; - virtual void Next() = 0; - virtual string key() = 0; - virtual string value() = 0; - virtual bool valid() = 0; - - DISABLE_COPY_AND_ASSIGN (Cursor); + public: + 
Cursor() { + } + virtual ~Cursor() { + } + virtual void SeekToFirst() = 0; + virtual void Next() = 0; + virtual string key() = 0; + virtual string value() = 0; + virtual bool valid() = 0; + + DISABLE_COPY_AND_ASSIGN (Cursor); }; class Transaction { - public: - Transaction() { - } - virtual ~Transaction() { - } - virtual void Put(const string& key, const string& value) = 0; - virtual void Commit() = 0; - - DISABLE_COPY_AND_ASSIGN (Transaction); + public: + Transaction() { + } + virtual ~Transaction() { + } + virtual void Put(const string& key, const string& value) = 0; + virtual void Commit() = 0; + + DISABLE_COPY_AND_ASSIGN (Transaction); }; class DB { - public: - DB() { - } - virtual ~DB() { - } - virtual void Open(const string& source, Mode mode) = 0; - virtual void Close() = 0; - virtual Cursor* NewCursor() = 0; - virtual Transaction* NewTransaction() = 0; - - DISABLE_COPY_AND_ASSIGN (DB); + public: + DB() { + } + virtual ~DB() { + } + virtual void Open(const string& source, Mode mode) = 0; + virtual void Close() = 0; + virtual Cursor* NewCursor() = 0; + virtual Transaction* NewTransaction() = 0; + + DISABLE_COPY_AND_ASSIGN (DB); }; DB* GetDB(DataParameter::DB backend); diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index c63fdbb0..c0f6ab62 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -12,83 +12,80 @@ namespace caffe { namespace db { class LevelDBCursor: public Cursor { - public: - explicit LevelDBCursor(leveldb::Iterator* iter) - : - iter_(iter) { - SeekToFirst(); - } - ~LevelDBCursor() { - delete iter_; - } - virtual void SeekToFirst() { - iter_->SeekToFirst(); - } - virtual void Next() { - iter_->Next(); - } - virtual string key() { - return iter_->key().ToString(); - } - virtual string value() { - return iter_->value().ToString(); - } - virtual bool valid() { - return iter_->Valid(); - } + public: + explicit LevelDBCursor(leveldb::Iterator* iter) + : iter_(iter) { + 
SeekToFirst(); + } + ~LevelDBCursor() { + delete iter_; + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + } + virtual void Next() { + iter_->Next(); + } + virtual string key() { + return iter_->key().ToString(); + } + virtual string value() { + return iter_->value().ToString(); + } + virtual bool valid() { + return iter_->Valid(); + } - private: - leveldb::Iterator* iter_; + private: + leveldb::Iterator* iter_; }; class LevelDBTransaction: public Transaction { - public: - explicit LevelDBTransaction(leveldb::DB* db) - : - db_(db) { - CHECK_NOTNULL(db_); - } - virtual void Put(const string& key, const string& value) { - batch_.Put(key, value); - } - virtual void Commit() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); - CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); - } + public: + explicit LevelDBTransaction(leveldb::DB* db) + : db_(db) { + CHECK_NOTNULL(db_); + } + virtual void Put(const string& key, const string& value) { + batch_.Put(key, value); + } + virtual void Commit() { + leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); + CHECK(status.ok()) << "Failed to write batch to leveldb " << std::endl + << status.ToString(); + } - private: - leveldb::DB* db_; - leveldb::WriteBatch batch_; + private: + leveldb::DB* db_; + leveldb::WriteBatch batch_; - DISABLE_COPY_AND_ASSIGN (LevelDBTransaction); + DISABLE_COPY_AND_ASSIGN (LevelDBTransaction); }; class LevelDB: public DB { - public: - LevelDB() - : - db_(NULL) { - } - virtual ~LevelDB() { - Close(); - } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (db_ != NULL) { - delete db_; - db_ = NULL; - } - } - virtual LevelDBCursor* NewCursor() { - return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); - } - virtual LevelDBTransaction* NewTransaction() { - return new LevelDBTransaction(db_); - } + public: + LevelDB() + : db_(NULL) { + } + virtual ~LevelDB() { + Close(); 
+ } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (db_ != NULL) { + delete db_; + db_ = NULL; + } + } + virtual LevelDBCursor* NewCursor() { + return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); + } + virtual LevelDBTransaction* NewTransaction() { + return new LevelDBTransaction(db_); + } - private: - leveldb::DB* db_; + private: + leveldb::DB* db_; }; } // namespace db diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index 68cbb93a..232b439a 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -11,96 +11,93 @@ namespace caffe { namespace db { inline void MDB_CHECK(int mdb_status) { - CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); + CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); } class LMDBCursor: public Cursor { - public: - explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : - mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { - SeekToFirst(); - } - virtual ~LMDBCursor() { - mdb_cursor_close(mdb_cursor_); - mdb_txn_abort(mdb_txn_); - } - virtual void SeekToFirst() { - Seek (MDB_FIRST); - } - virtual void Next() { - Seek (MDB_NEXT); - } - virtual string key() { - return string(static_cast(mdb_key_.mv_data), - mdb_key_.mv_size); - } - virtual string value() { - return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); - } - virtual bool valid() { - return valid_; - } + public: + explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) + : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { + SeekToFirst(); + } + virtual ~LMDBCursor() { + mdb_cursor_close(mdb_cursor_); + mdb_txn_abort(mdb_txn_); + } + virtual void SeekToFirst() { + Seek (MDB_FIRST); + } + virtual void Next() { + Seek (MDB_NEXT); + } + virtual string key() { + return string(static_cast(mdb_key_.mv_data), + mdb_key_.mv_size); + } + virtual string value() { + return string(static_cast(mdb_value_.mv_data), + 
mdb_value_.mv_size); + } + virtual bool valid() { + return valid_; + } - private: - void Seek(MDB_cursor_op op) { - int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); - if (mdb_status == MDB_NOTFOUND) { - valid_ = false; - } else { - MDB_CHECK(mdb_status); - valid_ = true; - } - } + private: + void Seek(MDB_cursor_op op) { + int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); + if (mdb_status == MDB_NOTFOUND) { + valid_ = false; + } else { + MDB_CHECK(mdb_status); + valid_ = true; + } + } - MDB_txn* mdb_txn_; - MDB_cursor* mdb_cursor_; - MDB_val mdb_key_, mdb_value_; - bool valid_; + MDB_txn* mdb_txn_; + MDB_cursor* mdb_cursor_; + MDB_val mdb_key_, mdb_value_; + bool valid_; }; class LMDBTransaction: public Transaction { - public: - explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : - mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { - } - virtual void Put(const string& key, const string& value); - virtual void Commit() { - MDB_CHECK(mdb_txn_commit(mdb_txn_)); - } + public: + explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) + : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { + } + virtual void Put(const string& key, const string& value); + virtual void Commit() { + MDB_CHECK(mdb_txn_commit(mdb_txn_)); + } - private: - MDB_dbi* mdb_dbi_; - MDB_txn* mdb_txn_; + private: + MDB_dbi* mdb_dbi_; + MDB_txn* mdb_txn_; - DISABLE_COPY_AND_ASSIGN (LMDBTransaction); + DISABLE_COPY_AND_ASSIGN (LMDBTransaction); }; class LMDB: public DB { - public: - LMDB() - : - mdb_env_(NULL) { - } - virtual ~LMDB() { - Close(); - } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (mdb_env_ != NULL) { - mdb_dbi_close(mdb_env_, mdb_dbi_); - mdb_env_close(mdb_env_); - mdb_env_ = NULL; - } - } - virtual LMDBCursor* NewCursor(); - virtual LMDBTransaction* NewTransaction(); + public: + LMDB() + : mdb_env_(NULL) { + } + virtual ~LMDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual 
void Close() { + if (mdb_env_ != NULL) { + mdb_dbi_close(mdb_env_, mdb_dbi_); + mdb_env_close(mdb_env_); + mdb_env_ = NULL; + } + } + virtual LMDBCursor* NewCursor(); + virtual LMDBTransaction* NewTransaction(); - private: - MDB_env* mdb_env_; - MDB_dbi mdb_dbi_; + private: + MDB_env* mdb_env_; + MDB_dbi mdb_dbi_; }; } // namespace db diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index ee7ea10b..f962049d 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -30,83 +30,73 @@ namespace caffe { template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); template -void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset); +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, + const int width, const int channels, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const 
int stride_w, + Dtype* data_im, const int img_offset); template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col, const int col_offset); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); +void im2col_gpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void col2im_gpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); template void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_col, const int col_offset); template void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const 
int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_col, const int col_offset); template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_col, const int col_offset, + int optnum); template void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im, const int img_offset); + const int channels, const int height, const int width, const int psize, + const int pad, const int stride, Dtype* data_im, const int img_offset); template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int optnum); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_im, const int img_offset, + int optnum); template -void col2im_gpu_ocl(cl_mem data_col, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, cl_kernel Kernel); +void col2im_gpu_ocl(cl_mem data_col, const int channels, const int height, + const int width, const int ksize, const int pad, const int stride, + Dtype* data_im, cl_kernel Kernel); template -void im2col_gpu_ocl(cl_mem data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, cl_kernel Kernel); 
+void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height, + const int width, const int ksize, const int pad, const int stride, + Dtype* data_col, cl_kernel Kernel); } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp index c9a40c54..446abb81 100644 --- a/include/caffe/util/insert_splits.hpp +++ b/include/caffe/util/insert_splits.hpp @@ -12,14 +12,14 @@ namespace caffe { void InsertSplits(const NetParameter& param, NetParameter* param_split); void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param); + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param); string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx); + const int blob_idx); string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx); + const int blob_idx, const int split_idx); } // namespace caffe diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 7bd1d2db..c04cce6a 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -19,118 +19,116 @@ namespace caffe { using ::google::protobuf::Message; inline void MakeTempFilename(string* temp_filename) { - temp_filename->clear(); - *temp_filename = "/tmp/caffe_test.XXXXXX"; - char* temp_filename_cstr = new char[temp_filename->size() + 1]; - // NOLINT_NEXT_LINE(runtime/printf) - strcpy(temp_filename_cstr, temp_filename->c_str()); - int fd = mkstemp(temp_filename_cstr); - CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename; - close(fd); - *temp_filename = temp_filename_cstr; - delete[] temp_filename_cstr; + temp_filename->clear(); + *temp_filename = "/tmp/caffe_test.XXXXXX"; + char* temp_filename_cstr = new char[temp_filename->size() 
+ 1]; + // NOLINT_NEXT_LINE(runtime/printf) + strcpy(temp_filename_cstr, temp_filename->c_str()); + int fd = mkstemp(temp_filename_cstr); + CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename; + close(fd); + *temp_filename = temp_filename_cstr; + delete[] temp_filename_cstr; } inline void MakeTempDir(string* temp_dirname) { - temp_dirname->clear(); - *temp_dirname = "/tmp/caffe_test.XXXXXX"; - char* temp_dirname_cstr = new char[temp_dirname->size() + 1]; - // NOLINT_NEXT_LINE(runtime/printf) - strcpy(temp_dirname_cstr, temp_dirname->c_str()); - char* mkdtemp_result = mkdtemp(temp_dirname_cstr); - CHECK(mkdtemp_result != NULL) - << "Failed to create a temporary directory at: " << *temp_dirname; - *temp_dirname = temp_dirname_cstr; - delete[] temp_dirname_cstr; + temp_dirname->clear(); + *temp_dirname = "/tmp/caffe_test.XXXXXX"; + char* temp_dirname_cstr = new char[temp_dirname->size() + 1]; + // NOLINT_NEXT_LINE(runtime/printf) + strcpy(temp_dirname_cstr, temp_dirname->c_str()); + char* mkdtemp_result = mkdtemp(temp_dirname_cstr); + CHECK(mkdtemp_result != NULL) << "Failed to create a temporary directory at: " + << *temp_dirname; + *temp_dirname = temp_dirname_cstr; + delete[] temp_dirname_cstr; } bool ReadProtoFromTextFile(const char* filename, Message* proto); inline bool ReadProtoFromTextFile(const string& filename, Message* proto) { - return ReadProtoFromTextFile(filename.c_str(), proto); + return ReadProtoFromTextFile(filename.c_str(), proto); } inline void ReadProtoFromTextFileOrDie(const char* filename, Message* proto) { - CHECK(ReadProtoFromTextFile(filename, proto)); + CHECK(ReadProtoFromTextFile(filename, proto)); } inline void ReadProtoFromTextFileOrDie(const string& filename, Message* proto) { - ReadProtoFromTextFileOrDie(filename.c_str(), proto); + ReadProtoFromTextFileOrDie(filename.c_str(), proto); } void WriteProtoToTextFile(const Message& proto, const char* filename); inline void WriteProtoToTextFile(const Message& proto, 
const string& filename) { - WriteProtoToTextFile(proto, filename.c_str()); + WriteProtoToTextFile(proto, filename.c_str()); } bool ReadProtoFromBinaryFile(const char* filename, Message* proto); inline bool ReadProtoFromBinaryFile(const string& filename, Message* proto) { - return ReadProtoFromBinaryFile(filename.c_str(), proto); + return ReadProtoFromBinaryFile(filename.c_str(), proto); } inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) { - CHECK(ReadProtoFromBinaryFile(filename, proto)); + CHECK(ReadProtoFromBinaryFile(filename, proto)); } inline void ReadProtoFromBinaryFileOrDie(const string& filename, - Message* proto) { - ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); + Message* proto) { + ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); } void WriteProtoToBinaryFile(const Message& proto, const char* filename); -inline void WriteProtoToBinaryFile( - const Message& proto, const string& filename) { - WriteProtoToBinaryFile(proto, filename.c_str()); +inline void WriteProtoToBinaryFile(const Message& proto, + const string& filename) { + WriteProtoToBinaryFile(proto, filename.c_str()); } bool ReadFileToDatum(const string& filename, const int label, Datum* datum); inline bool ReadFileToDatum(const string& filename, Datum* datum) { - return ReadFileToDatum(filename, -1, datum); + return ReadFileToDatum(filename, -1, datum); } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum); +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum); inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); + const int height, const int width, const 
bool is_color, Datum* datum) { + return ReadImageToDatum(filename, label, height, width, is_color, "", datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, true, datum); + const int height, const int width, Datum* datum) { + return ReadImageToDatum(filename, label, height, width, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, is_color, datum); + const bool is_color, Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, is_color, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, datum); + Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, true, datum); } inline bool ReadImageToDatum(const string& filename, const int label, - const std::string & encoding, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); + const std::string & encoding, Datum* datum) { + return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); } bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width); -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color); cv::Mat ReadImageToCVMat(const string& filename); @@ -140,18 +138,16 @@ cv::Mat DecodeDatumToCVMat(const 
Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob); template -void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim, + int max_dim, Blob* blob); template -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); +void hdf5_save_nd_dataset(const hid_t file_id, const string& dataset_name, + const Blob& blob); } // namespace caffe diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 8a36069a..d7c67673 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -41,71 +41,64 @@ namespace caffe { // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template -void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); // Decaf gpu gemm provides an interface that is almost the same as the cpu // gemm function - following the c convention and calling the fortran-order // gpu code under the hood. 
template -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); template cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, - const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); /*This is Yuan Gao's sgemm_ex*/ template void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C, const int offset1, const int offset2, const int offset3); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C, const int offset1, const int offset2, const int offset3); template cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, - const int offB, const Dtype beta, - Dtype* C, const int offC); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const 
Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); template -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda, - const Dtype * x, size_t offx, const Dtype beta, int incx, - Dtype* y, size_t offy, int incy); +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, size_t offA, int lda, const Dtype * x, + size_t offx, const Dtype beta, int incx, Dtype* y, size_t offy, int incy); template void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); template -void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); template -void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); template void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + const Dtype beta, Dtype* Y); template void caffe_copy(const int N, const Dtype *X, Dtype *Y); @@ -117,14 +110,14 @@ template void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); inline void caffe_memset(const size_t N, const int alpha, void* X) { - memset(X, alpha, N); // NOLINT(caffe/alt_fn) + memset(X, alpha, N); // NOLINT(caffe/alt_fn) } inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { #ifndef CPU_ONLY - ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); + 
ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); #else - NO_GPU; + NO_GPU; #endif } @@ -144,7 +137,7 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); template void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, - Dtype *X); + Dtype *X); template void caffe_scal(const int N, const Dtype alpha, Dtype *X); @@ -176,7 +169,7 @@ void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, - const Dtype* b, Dtype* y); + const Dtype* b, Dtype* y); template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); @@ -207,11 +200,11 @@ void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); template void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r); @@ -236,7 +229,7 @@ int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); template uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); + const Dtype* y); // Returns the sum of the absolute values of the elements of vector x template @@ -249,7 +242,7 @@ void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c template inline char caffe_sign(Dtype val) { - return (Dtype(0) < val) - (val < Dtype(0)); + return (Dtype(0) < val) - (val < Dtype(0)); } // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC @@ -345,7 +338,7 @@ void caffe_log(const int n, const Dtype* a, Dtype* y); template Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); + const Dtype* y, const int incy); } // namespace 
caffe #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 06262fbf..2ca24374 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -81,16 +81,14 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, - const int incX, const float beta, float* Y, - const int incY) { - cblas_sscal(N, beta, Y, incY); - cblas_saxpy(N, alpha, X, incX, Y, incY); + const int incX, const float beta, float* Y, const int incY) { + cblas_sscal(N, beta, Y, incY); + cblas_saxpy(N, alpha, X, incX, Y, incY); } inline void cblas_daxpby(const int N, const double alpha, const double* X, - const int incX, const double beta, double* Y, - const int incY) { - cblas_dscal(N, beta, Y, incY); - cblas_daxpy(N, alpha, X, incX, Y, incY); + const int incX, const double beta, double* Y, const int incY) { + cblas_dscal(N, beta, Y, incY); + cblas_daxpy(N, alpha, X, incX, Y, incY); } #endif // USE_MKL diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 9febaa04..776fec11 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -33,7 +33,7 @@ template void ocl_memset(Dtype* buffer, const Dtype value, const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, - const int count); + const int count); void eventCallback(cl_event event, cl_int event_status, void * user_data); } // namespace caffe diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 869bc83b..25a86090 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -31,189 +31,188 @@ namespace caffe { typedef unsigned int uint32_t; -template inline std::string get_dtype_suffix() -{ - dtype x; - const char type = 
typeid(x).name()[0]; - std::string suffix; - switch (type) { - case 'i': - suffix = "_int"; - break; - case 'd': - suffix = "_double"; - break; - case 'f': - default: - suffix = "_float"; - } - return suffix; +template inline std::string get_dtype_suffix() { + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch (type) { + case 'i': + suffix = "_int"; + break; + case 'd': + suffix = "_double"; + break; + case 'f': + default: + suffix = "_float"; + } + return suffix; } template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, - const int M_, const int packing_num); + const int M_, const int packing_num); template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, - const int optnum); + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum); template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* bottom_data, Dtype* scale_data); + const Dtype* bottom_data, Dtype* scale_data); template void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* scale, Dtype* data); + const Dtype* scale, Dtype* data); template Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* prob_data, const Dtype* label, cl_mem d_loss); + const Dtype* prob_data, const Dtype* label, cl_mem d_loss); template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); template void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, - const Dtype* label); + const Dtype* label); template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, 
const int kernel_size_, const int stride_, - Dtype* top_data); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data); template void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, - Dtype* top_mask); + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask); template void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff); + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); template void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const 
bottom_diff); + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); template void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, - const Dtype* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, Dtype* const bottom_diff); + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff); template void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); template void SigmoidBackward(const int count, const Dtype* top_diff, - const Dtype* top_data, Dtype* bottom_diff); + const Dtype* top_data, Dtype* bottom_diff); template void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data); template void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, - Dtype* bottom_diff); + Dtype* bottom_diff); template void ThresholdForward(const int count, const Dtype threshold, - const Dtype* bottom_data, Dtype* top_data); + const Dtype* bottom_data, Dtype* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, Dtype* top_data); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int 
kernel_size_, const int stride_, + const int pad_, Dtype* top_data); template void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data); + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data); template void StoPoolForwardTrain(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* idx_data, Dtype* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data); template void StoPoolForwardTest(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, 
const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, Dtype* bottom_diff); + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, - const int clnum, const int channels_, const int intheight_, - const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, const int pad_, Dtype* bottom_diff); + const int clnum, const int channels_, const int intheight_, + const int width_, const int pooled_height_, const int pooled_width_, + const int kernel_size_, const int stride_, const int pad_, + Dtype* bottom_diff); template void PReLUForward(const int count, const int channels, const int dim, - const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, - const int div_factor); + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor); template void PReLUBackward(const int count, const int channels, const int dim, - const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, - const Dtype* slope_data, const int div_factor); + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor); template void PReLUParamBackward(const int count, const Dtype* top_diff, - const int offset_out, const Dtype* bottom_data, const int offset_in, - Dtype* bottom_diff); + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff); template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, - Dtype negative_slope); + Dtype negative_slope); template void ReLUBackward(const int count, const Dtype* top_diff, - const Dtype* 
bottom_data, Dtype* bottom_diff, Dtype negative_slope); + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope); template void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y); template void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype *top_data); + const int* MaskMem, const Dtype scale_, Dtype *top_data); template void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff); + const float threshold_, const Dtype scale_, Dtype* bottom_diff); template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, - Dtype threshold); + Dtype threshold); template void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup); @@ -222,23 +221,22 @@ template void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V); template -void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y ); +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y); template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y); template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out); + const int spatial_dim, const Dtype* data, Dtype* out); template -void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data); +void kernel_channel_subtract(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data); template void kernel_powx(const int count, const Dtype* data, const Dtype alpha, - Dtype* out); + Dtype* out); template void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out); @@ -263,29 +261,28 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out); template void kernel_channel_sum(const int num, const int channels, 
- const int spatial_dim, const Dtype* data, Dtype* channel_sum); + const int spatial_dim, const Dtype* data, Dtype* channel_sum); template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data); + const int spatial_dim, const Dtype* channel_sum, Dtype* data); template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot); + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot); template -void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts); +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, + const Dtype* label, Dtype* loss, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts); template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts); + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts); template void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y); @@ -294,22 +291,21 @@ template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data); template -void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale); +void LRNFillScale(const int nthreads, const Dtype* const in, const int num, + const 
int channels, const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, Dtype* const scale); template -void LRNComputeOutput(int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out); +void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale, + Dtype negative_beta, Dtype* out); template -void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff); +void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data, + const Dtype* const top_data, const Dtype* const scale, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int size, + const Dtype negative_beta, const Dtype cache_ratio, + Dtype* const bottom_diff); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y); @@ -321,28 +317,26 @@ void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data); template void BNLLBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype *bottom_diff); + const Dtype* bottom_data, Dtype *bottom_diff); template void Concat(const int nthreads, const Dtype* in_data, const bool forward, - const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype *out_data); + const int num_concats, const int concat_size, const int top_concat_axis, + const int bottom_concat_axis, const int offset_concat_axis, + Dtype *out_data); template -void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff); +void 
CLLBackward(const int count, const int channels, const Dtype margin, + const bool legacy_version, const Dtype alpha, const Dtype* y, + const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff); template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask); + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, int* mask); template -void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff); +void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx, + const int* mask, Dtype* bottom_diff); } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ // namespace caffe diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp index 7688e16a..febd932d 100644 --- a/include/caffe/util/rng.hpp +++ b/include/caffe/util/rng.hpp @@ -14,30 +14,29 @@ namespace caffe { typedef boost::mt19937 rng_t; inline rng_t* caffe_rng() { - return static_cast(Caffe::rng_stream().generator()); + return static_cast(Caffe::rng_stream().generator()); } // Fisher–Yates algorithm template inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end, - RandomGenerator* gen) { - typedef typename std::iterator_traits::difference_type - difference_type; - typedef typename boost::uniform_int dist_type; - - difference_type length = std::distance(begin, end); - if (length <= 0) - return; - - for (difference_type i = length - 1; i > 0; --i) { - dist_type dist(0, i); - std::iter_swap(begin + i, begin + dist(*gen)); - } + RandomGenerator* gen) { + typedef typename std::iterator_traits::difference_type difference_type; + typedef typename boost::uniform_int dist_type; + + difference_type length = std::distance(begin, end); + if (length <= 0) + return; + + for (difference_type i = length - 1; i > 0; --i) { + dist_type dist(0, i); + std::iter_swap(begin + i, begin + dist(*gen)); + } } template inline void 
shuffle(RandomAccessIterator begin, RandomAccessIterator end) { - shuffle(begin, end, caffe_rng()); + shuffle(begin, end, caffe_rng()); } } // namespace caffe diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp index 2dc3cceb..496ba1e0 100644 --- a/include/caffe/util/upgrade_proto.hpp +++ b/include/caffe/util/upgrade_proto.hpp @@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param); // taking its top blob as input. // Error if any of these above layers are not-conv layers. void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad); + NetParameter* param_upgraded_pad); // Upgrade a single V0LayerConnection to the V1LayerParameter format. bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param); + V1LayerParameter* layer_param); V1LayerParameter_LayerType UpgradeV0LayerType(const string& type); @@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param); bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param); bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param); + LayerParameter* layer_param); const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type); @@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param); // Read parameters from a file into a NetParameter proto message. 
void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 0c954fa2..bc6cd5de 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -22,149 +22,146 @@ namespace caffe { */ template class BaseConvolutionLayer: public Layer { - public: - explicit BaseConvolutionLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual ~BaseConvolutionLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int MinBottomBlobs() const { - return 1; - } - virtual inline int MinTopBlobs() const { - return 1; - } - virtual inline bool EqualNumBottomTopBlobs() const { - return true; - } - - protected: - // Helper functions that abstract away the column buffer and gemm arguments. - // The last argument in forward_cpu_gemm is so that we can skip the im2col if - // we just called weight_cpu_gemm with the same input. 
- void forward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_cpu_bias(Dtype* output, const Dtype* bias); - void backward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output); - void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* - weights); - void backward_cpu_bias(Dtype* bias, const Dtype* input); - //opencl related setup - void ocl_setup(); + public: + explicit BaseConvolutionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~BaseConvolutionLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline bool EqualNumBottomTopBlobs() const { + return true; + } + + protected: + // Helper functions that abstract away the column buffer and gemm arguments. + // The last argument in forward_cpu_gemm is so that we can skip the im2col if + // we just called weight_cpu_gemm with the same input. 
+ void forward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_cpu_bias(Dtype* output, const Dtype* bias); + void backward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output); + void weight_cpu_gemm(const Dtype* input, const Dtype* output, + Dtype* weights); + void backward_cpu_bias(Dtype* bias, const Dtype* input); + //opencl related setup + void ocl_setup(); #ifndef CPU_ONLY - void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_bias(Dtype* output, const Dtype* bias); - void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); - void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void backward_gpu_bias(Dtype* bias, const Dtype* input); - #endif - - // reverse_dimensions should return true iff we are implementing deconv, so - // that conv helpers know which dimensions are which. - virtual bool reverse_dimensions() = 0; - // Compute height_out_ and width_out_ from other parameters. 
- virtual void compute_output_shape() = 0; - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int num_; - int channels_; - int pad_h_, pad_w_; - int height_, width_; - int group_; - int num_output_; - int height_out_, width_out_; - bool bias_term_; - bool is_1x1_; - - private: - // wrap im2col/col2im so we don't have to remember the (long) argument lists - inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { - im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { - col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_bias(Dtype* output, const Dtype* bias); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); + void backward_gpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void backward_gpu_bias(Dtype* bias, const Dtype* input); +#endif + + // reverse_dimensions should return true iff we are implementing deconv, so + // that conv helpers know which dimensions are which. + virtual bool reverse_dimensions() = 0; + // Compute height_out_ and width_out_ from other parameters. 
+ virtual void compute_output_shape() = 0; + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int num_; + int channels_; + int pad_h_, pad_w_; + int height_, width_; + int group_; + int num_output_; + int height_out_, width_out_; + bool bias_term_; + bool is_1x1_; + + private: + // wrap im2col/col2im so we don't have to remember the (long) argument lists + inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { + im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + } + inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { + col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + } #ifndef CPU_ONLY - inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, - 0); - } - inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, - bottom_offset_); - } - protected: - inline void conv_im2col_gpu_opt(const Dtype* data) { - im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2); - } - inline void conv_col2im_gpu_opt(Dtype* data) { - col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, - conv_in_width_, - kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2); - } - private: - inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { - transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, - M_ * opt_num2, opt_num2); - } - inline void conv_transpose_gpu(const Dtype* data) { - opttrans(data, 
top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, - opt_num2); - } - protected: - inline void gpu_memset(Dtype* data, Dtype value, int count) { - ocl_memset(data, value, count); - } + inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { + im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, col_buff, 0); + } + inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { + col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_); + } + protected: + inline void conv_im2col_gpu_opt(const Dtype* data) { + im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, + opt_num2); + } + inline void conv_col2im_gpu_opt(Dtype* data) { + col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, + opt_num2); + } + private: + inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { + transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, + M_ * opt_num2, opt_num2); + } + inline void conv_transpose_gpu(const Dtype* data) { + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + } + protected: + inline void gpu_memset(Dtype* data, Dtype value, int count) { + ocl_memset(data, value, count); + } #endif - private: - int conv_out_channels_; - int conv_in_channels_; - int conv_out_spatial_dim_; - int conv_in_height_; - int conv_in_width_; - int kernel_dim_; + private: + int conv_out_channels_; + int conv_in_channels_; + int conv_out_spatial_dim_; + int conv_in_height_; + int conv_in_width_; + int kernel_dim_; - Blob col_buffer_; - Blob bias_multiplier_; + Blob col_buffer_; + Blob bias_multiplier_; //opencl related data structures - 
protected: - int opt_num2; - int M_, N_, K_; - int weight_offset_; - int col_offset_; - int output_offset_; - int top_offset_, top_offset_opt, bottom_offset_; - public: - static cl_mem subTopMem, transMem; - static size_t subtop_mem_size, trans_mem_size; + protected: + int opt_num2; + int M_, N_, K_; + int weight_offset_; + int col_offset_; + int output_offset_; + int top_offset_, top_offset_opt, bottom_offset_; + public: + static cl_mem subTopMem, transMem; + static size_t subtop_mem_size, trans_mem_size; }; /** @@ -185,66 +182,65 @@ class BaseConvolutionLayer: public Layer { */ template class ConvolutionLayer: public BaseConvolutionLayer { - public: - /** - * @param param provides ConvolutionParameter convolution_param, - * with ConvolutionLayer options: - * - num_output. The number of filters. - * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by - * kernel_size for square filters or kernel_h and kernel_w for rectangular - * filters. - * - stride / stride_h / stride_w (\b optional, default 1). The filter - * stride, given by stride_size for equal dimensions or stride_h and stride_w - * for different strides. By default the convolution is dense with stride 1. - * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for - * convolution, given by pad for equal dimensions or pad_h and pad_w for - * different padding. Input padding is computed implicitly instead of - * actually padding. - * - group (\b optional, default 1). The number of filter groups. Group - * convolution is a method for reducing parameterization by selectively - * connecting input and output channels. The input and output channel dimensions must be divisible - * by the number of groups. For group @f$ \geq 1 @f$, the - * convolutional filters' input and output channels are separated s.t. each - * group takes 1 / group of the input channels and makes 1 / group of the - * output channels. 
Concretely 4 input channels, 8 output channels, and - * 2 groups separate input channels 1-2 and output channels 1-4 into the - * first group and input channels 3-4 and output channels 5-8 into the second - * group. - * - bias_term (\b optional, default true). Whether to have a bias. - * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library - * kernels + stream parallelism) engines. - */ - explicit ConvolutionLayer(const LayerParameter& param) - : - BaseConvolutionLayer(param) { - } - - virtual inline const char* type() const { - return "Convolution"; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { - return false; - } - virtual void compute_output_shape(); - - virtual void Forward_gpu_org(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu_opt2(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + public: + /** + * @param param provides ConvolutionParameter convolution_param, + * with ConvolutionLayer options: + * - num_output. The number of filters. + * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by + * kernel_size for square filters or kernel_h and kernel_w for rectangular + * filters. + * - stride / stride_h / stride_w (\b optional, default 1). The filter + * stride, given by stride_size for equal dimensions or stride_h and stride_w + * for different strides. By default the convolution is dense with stride 1. 
+ * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for + * convolution, given by pad for equal dimensions or pad_h and pad_w for + * different padding. Input padding is computed implicitly instead of + * actually padding. + * - group (\b optional, default 1). The number of filter groups. Group + * convolution is a method for reducing parameterization by selectively + * connecting input and output channels. The input and output channel dimensions must be divisible + * by the number of groups. For group @f$ \geq 1 @f$, the + * convolutional filters' input and output channels are separated s.t. each + * group takes 1 / group of the input channels and makes 1 / group of the + * output channels. Concretely 4 input channels, 8 output channels, and + * 2 groups separate input channels 1-2 and output channels 1-4 into the + * first group and input channels 3-4 and output channels 5-8 into the second + * group. + * - bias_term (\b optional, default true). Whether to have a bias. + * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library + * kernels + stream parallelism) engines. 
+ */ + explicit ConvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Convolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return false; + } + virtual void compute_output_shape(); + + virtual void Forward_gpu_org(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Forward_gpu_opt2(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_opt2(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -263,29 +259,28 @@ class ConvolutionLayer: public BaseConvolutionLayer { */ template class DeconvolutionLayer: public BaseConvolutionLayer { - public: - explicit DeconvolutionLayer(const LayerParameter& param) - : - BaseConvolutionLayer(param) { - } - - virtual inline const char* type() const { - return "Deconvolution"; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { - return true; - } - virtual void compute_output_shape(); + public: + explicit DeconvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() 
const { + return "Deconvolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return true; + } + virtual void compute_output_shape(); }; #ifdef USE_CUDNN @@ -305,31 +300,31 @@ class DeconvolutionLayer: public BaseConvolutionLayer { */ template class CuDNNConvolutionLayer : public ConvolutionLayer { - public: - explicit CuDNNConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNConvolutionLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t* handle_; - cudaStream_t* stream_; - vector bottom_descs_, top_descs_; - cudnnTensorDescriptor_t bias_desc_; - cudnnFilterDescriptor_t filter_desc_; - vector conv_descs_; - int bottom_offset_, top_offset_, weight_offset_, bias_offset_; - size_t workspaceSizeInBytes; - void *workspace; + public: + explicit CuDNNConvolutionLayer(const LayerParameter& param) + : ConvolutionLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNConvolutionLayer(); + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const 
vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t* handle_; + cudaStream_t* stream_; + vector bottom_descs_, top_descs_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; + vector conv_descs_; + int bottom_offset_, top_offset_, weight_offset_, bias_offset_; + size_t workspaceSizeInBytes; + void *workspace; }; #endif @@ -342,41 +337,40 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { */ template class Im2colLayer: public Layer { - public: - explicit Im2colLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Im2col"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int channels_; - int height_, width_; - int pad_h_, pad_w_; + public: + explicit Im2colLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Im2col"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& 
top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int channels_; + int height_, width_; + int pad_h_, pad_w_; }; // Forward declare PoolingLayer and SplitLayer for use in LRNLayer. @@ -390,80 +384,79 @@ template class SplitLayer; */ template class LRNLayer: public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "LRN"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int ExactNumTopBlobs() const { - return 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int 
num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; - - // Fields used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; + public: + explicit LRNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "LRN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void WithinChannelBackward(const 
vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; }; @@ -474,51 +467,51 @@ class LRNLayer: public Layer { */ template class PoolingLayer: public Layer { - public: - explicit PoolingLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "Pooling"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int MinTopBlobs() const { - return 1; - } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 
2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - Blob rand_idx_; - Blob max_idx_; + public: + explicit PoolingLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Pooling"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int pad_h_, pad_w_; + int channels_; + int height_, width_; + int pooled_height_, pooled_width_; + bool global_pooling_; + Blob rand_idx_; + Blob max_idx_; }; @@ -529,29 +522,29 @@ class PoolingLayer: public Layer { */ template class CuDNNPoolingLayer : public PoolingLayer { - public: - explicit CuDNNPoolingLayer(const LayerParameter& param) - : PoolingLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNPoolingLayer(); - // Currently, cuDNN does not support the extra top blob. - virtual inline int MinTopBlobs() const {return -1;} - virtual inline int ExactNumTopBlobs() const {return 1;} - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; + public: + explicit CuDNNPoolingLayer(const LayerParameter& param) + : PoolingLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNPoolingLayer(); + // Currently, cuDNN does not support the extra top blob. 
+ virtual inline int MinTopBlobs() const {return -1;} + virtual inline int ExactNumTopBlobs() const {return 1;} + + protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t bottom_desc_, top_desc_; + cudnnPoolingDescriptor_t pooling_desc_; + cudnnPoolingMode_t mode_; }; #endif @@ -563,70 +556,70 @@ class CuDNNPoolingLayer : public PoolingLayer { */ template class SPPLayer: public Layer { - public: - explicit SPPLayer(const LayerParameter& param) - : - Layer(param) { - } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { - return "SPP"; - } - virtual inline int ExactNumBottomBlobs() const { - return 1; - } - virtual inline int MinTopBlobs() const { - return 1; - } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 
2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - // calculates the kernel and stride dimensions for the pooling layer, - // returns a correctly configured LayerParameter for a PoolingLayer - virtual LayerParameter GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param); - - int pyramid_height_; - int bottom_h_, bottom_w_; - int channels_; - int kernel_h_, kernel_w_; - int pad_h_, pad_w_; - - /// the internal Split layer that feeds the pooling layers - shared_ptr > split_layer_; - /// top vector holder used in call to the underlying SplitLayer::Forward - vector*> split_top_vec_; - /// bottom vector holder used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_bottom_vecs_; - /// the internal Pooling layers of different kernel sizes - vector > > pooling_layers_; - /// top vector holders used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_top_vecs_; - /// pooling_outputs stores the outputs of the PoolingLayers - vector*> pooling_outputs_; - /// the internal Flatten layers that the Pooling layers feed into - vector*> flatten_layers_; - /// top vector holders used in call to the underlying FlattenLayer::Forward - vector*>*> flatten_top_vecs_; - /// flatten_outputs stores the outputs of the FlattenLayers - vector*> flatten_outputs_; - /// bottom vector holder used in call to the underlying ConcatLayer::Forward - vector*> concat_bottom_vec_; - /// the internal Concat layers that the Flatten layers feed into - shared_ptr > concat_layer_; + public: + explicit SPPLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return 
"SPP"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + // calculates the kernel and stride dimensions for the pooling layer, + // returns a correctly configured LayerParameter for a PoolingLayer + virtual LayerParameter GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param); + + int pyramid_height_; + int bottom_h_, bottom_w_; + int channels_; + int kernel_h_, kernel_w_; + int pad_h_, pad_w_; + + /// the internal Split layer that feeds the pooling layers + shared_ptr > split_layer_; + /// top vector holder used in call to the underlying SplitLayer::Forward + vector*> split_top_vec_; + /// bottom vector holder used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_bottom_vecs_; + /// the internal Pooling layers of different kernel sizes + vector > > pooling_layers_; + /// top vector holders used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_top_vecs_; + /// pooling_outputs stores the outputs of the PoolingLayers + vector*> pooling_outputs_; + /// the internal Flatten layers that the Pooling layers feed into + vector*> flatten_layers_; + /// top vector holders used in call to the underlying FlattenLayer::Forward + vector*>*> flatten_top_vecs_; + /// flatten_outputs stores the outputs of the FlattenLayers + vector*> flatten_outputs_; + /// bottom vector holder used in call to the underlying ConcatLayer::Forward + vector*> concat_bottom_vec_; 
+ /// the internal Concat layers that the Flatten layers feed into + shared_ptr > concat_layer_; }; } // namespace caffe diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 5e327c67..089899fc 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -10,505 +10,497 @@ namespace caffe { template void Blob::Reshape(const int num, const int channels, const int height, - const int width) { - vector shape(4); - shape[0] = num; - shape[1] = channels; - shape[2] = height; - shape[3] = width; - Reshape(shape); + const int width) { + vector shape(4); + shape[0] = num; + shape[1] = channels; + shape[2] = height; + shape[3] = width; + Reshape(shape); } template void Blob::Reshape(const vector& shape) { - CHECK_LE(shape.size(), kMaxBlobAxes); - count_ = 1; - shape_.resize(shape.size()); - for (int i = 0; i < shape.size(); ++i) { - CHECK_GE(shape[i], 0); - CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; - count_ *= shape[i]; - shape_[i] = shape[i]; - } - if (count_ > capacity_) { - capacity_ = count_; - data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); - diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); - } + CHECK_LE(shape.size(), kMaxBlobAxes); + count_ = 1; + shape_.resize(shape.size()); + for (int i = 0; i < shape.size(); ++i) { + CHECK_GE(shape[i], 0); + CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; + count_ *= shape[i]; + shape_[i] = shape[i]; + } + if (count_ > capacity_) { + capacity_ = count_; + data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); + diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); + } } template void Blob::Reshape(const BlobShape& shape) { - CHECK_LE(shape.dim_size(), kMaxBlobAxes); - vector shape_vec(shape.dim_size()); - for (int i = 0; i < shape.dim_size(); ++i) { - shape_vec[i] = shape.dim(i); - } - Reshape(shape_vec); + CHECK_LE(shape.dim_size(), kMaxBlobAxes); + vector shape_vec(shape.dim_size()); + for (int i = 0; i < shape.dim_size(); ++i) { + shape_vec[i] = 
shape.dim(i); + } + Reshape(shape_vec); } template void Blob::ReshapeLike(const Blob& other) { - Reshape(other.shape()); + Reshape(other.shape()); } template Blob::Blob(const int num, const int channels, const int height, - const int width) -// capacity_ must be initialized before calling Reshape -: - capacity_(0) { - Reshape(num, channels, height, width); + const int width) + : capacity_(0) { + Reshape(num, channels, height, width); } template Blob::Blob(const vector& shape) -// capacity_ must be initialized before calling Reshape -: - capacity_(0) { - Reshape(shape); + : capacity_(0) { + Reshape(shape); } template const Dtype* Blob::cpu_data() const { - CHECK (data_); - return (const Dtype*) data_->cpu_data(); + CHECK (data_); + return (const Dtype*) data_->cpu_data(); } template void Blob::set_cpu_data(Dtype* data) { - CHECK(data); - data_->set_cpu_data(data); + CHECK(data); + data_->set_cpu_data(data); } template const Dtype* Blob::gpu_data() const { - CHECK (data_); - return (const Dtype*) data_->gpu_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_data(); } template const Dtype* Blob::gpu_cache_data() const { - CHECK (data_); - return (const Dtype*) data_->gpu_cache_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_cache_data(); } template const Dtype* Blob::cpu_diff() const { - CHECK (diff_); - return (const Dtype*) diff_->cpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->cpu_data(); } template const Dtype* Blob::gpu_diff() const { - CHECK (diff_); - return (const Dtype*) diff_->gpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->gpu_data(); } template Dtype* Blob::mutable_cpu_data() { - CHECK (data_); - return static_cast(data_->mutable_cpu_data()); + CHECK (data_); + return static_cast(data_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_data() { - CHECK (data_); - return static_cast(data_->mutable_gpu_data()); + CHECK (data_); + return static_cast(data_->mutable_gpu_data()); } template Dtype* 
Blob::mutable_cpu_diff() { - CHECK (diff_); - return static_cast(diff_->mutable_cpu_data()); + CHECK (diff_); + return static_cast(diff_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_diff() { - CHECK (diff_); - return static_cast(diff_->mutable_gpu_data()); + CHECK (diff_); + return static_cast(diff_->mutable_gpu_data()); } template void Blob::ShareData(const Blob& other) { - CHECK_EQ(count_, other.count()); - data_ = other.data(); + CHECK_EQ(count_, other.count()); + data_ = other.data(); } template void Blob::ShareDiff(const Blob& other) { - CHECK_EQ(count_, other.count()); - diff_ = other.diff(); + CHECK_EQ(count_, other.count()); + diff_ = other.diff(); } // The "update" method is used for parameter blobs in a Net, which are stored // as Blob or Blob -- hence we do not define it for // Blob or Blob. template <> void Blob::Update() { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template <> void Blob::Update() { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template void Blob::Update() { - // We will perform update based on where the data is located. - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - // perform computation on CPU - caffe_axpy < Dtype > (count_, Dtype(-1), - static_cast(diff_->cpu_data()), - static_cast(data_->mutable_cpu_data())); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - // perform computation on GPU - caffe_gpu_axpy < Dtype > (count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); + // We will perform update based on where the data is located. 
+ switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + // perform computation on CPU + caffe_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->cpu_data()), static_cast(data_->mutable_cpu_data())); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + // perform computation on GPU + caffe_gpu_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->gpu_data()), static_cast(data_->mutable_gpu_data())); #else - NO_GPU; + NO_GPU; #endif - break; - default: - LOG(FATAL) << "Syncedmem not initialized."; - } + break; + default: + LOG(FATAL) << "Syncedmem not initialized."; + } } template <> unsigned int Blob::asum_data() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template <> int Blob::asum_data() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template Dtype Blob::asum_data() const { - if (!data_) { - return 0; - } - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - return caffe_cpu_asum(count_, cpu_data()); - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_data(), &asum); - return asum; - } + if (!data_) { + return 0; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + return caffe_cpu_asum(count_, cpu_data()); + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + { + Dtype asum; + caffe_gpu_asum(count_, gpu_data(), &asum); + return asum; + } #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return 0; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } + return 0; } template <> unsigned int Blob::asum_diff() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template <> int Blob::asum_diff() const { - NOT_IMPLEMENTED; - 
return 0; + NOT_IMPLEMENTED; + return 0; } template Dtype Blob::asum_diff() const { - if (!diff_) { - return 0; - } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - return caffe_cpu_asum(count_, cpu_diff()); - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_diff(), &asum); - return asum; - } + if (!diff_) { + return 0; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + return caffe_cpu_asum(count_, cpu_diff()); + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + { + Dtype asum; + caffe_gpu_asum(count_, gpu_diff(), &asum); + return asum; + } #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); - } - return 0; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); + } + return 0; } template <> unsigned int Blob::sumsq_data() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template <> int Blob::sumsq_data() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template Dtype Blob::sumsq_data() const { - Dtype sumsq; - const Dtype* data; - if (!data_) { - return 0; - } - switch (data_->head()) { - case SyncedMemory::HEAD_AT_CPU: - data = cpu_data(); - sumsq = caffe_cpu_dot(count_, data, data); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - data = gpu_data(); - caffe_gpu_dot(count_, data, data, &sumsq); + Dtype sumsq; + const Dtype* data; + if (!data_) { + return 0; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + data = cpu_data(); + sumsq = caffe_cpu_dot(count_, data, data); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + data = gpu_data(); + caffe_gpu_dot(count_, data, data, &sumsq); #else - NO_GPU; + 
NO_GPU; #endif - break; - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return sumsq; + break; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } + return sumsq; } template <> unsigned int Blob::sumsq_diff() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template <> int Blob::sumsq_diff() const { - NOT_IMPLEMENTED; - return 0; + NOT_IMPLEMENTED; + return 0; } template Dtype Blob::sumsq_diff() const { - Dtype sumsq; - const Dtype* diff; - if (!diff_) { - return 0; - } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - diff = cpu_diff(); - sumsq = caffe_cpu_dot(count_, diff, diff); - break; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - diff = gpu_diff(); - caffe_gpu_dot(count_, diff, diff, &sumsq); - break; + Dtype sumsq; + const Dtype* diff; + if (!diff_) { + return 0; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + diff = cpu_diff(); + sumsq = caffe_cpu_dot(count_, diff, diff); + break; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + diff = gpu_diff(); + caffe_gpu_dot(count_, diff, diff, &sumsq); + break; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return 0; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } - return sumsq; + case SyncedMemory::UNINITIALIZED: + return 0; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } + return sumsq; } template <> void Blob::scale_data(unsigned int scale_factor) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template <> void Blob::scale_data(int scale_factor) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template void Blob::scale_data(Dtype scale_factor) { - Dtype* data; - if (!data_) { - return; - } - switch (data_->head()) { - case 
SyncedMemory::HEAD_AT_CPU: - data = mutable_cpu_data(); - caffe_scal(count_, scale_factor, data); - return; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - data = mutable_gpu_data(); - caffe_gpu_scal(count_, scale_factor, data); - return; + Dtype* data; + if (!data_) { + return; + } + switch (data_->head()) { + case SyncedMemory::HEAD_AT_CPU: + data = mutable_cpu_data(); + caffe_scal(count_, scale_factor, data); + return; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + data = mutable_gpu_data(); + caffe_gpu_scal(count_, scale_factor, data); + return; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return; - default: - LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); - } + case SyncedMemory::UNINITIALIZED: + return; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head(); + } } template <> void Blob::scale_diff(unsigned int scale_factor) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template <> void Blob::scale_diff(int scale_factor) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; } template void Blob::scale_diff(Dtype scale_factor) { - Dtype* diff; - if (!diff_) { - return; - } - switch (diff_->head()) { - case SyncedMemory::HEAD_AT_CPU: - diff = mutable_cpu_diff(); - caffe_scal(count_, scale_factor, diff); - return; - case SyncedMemory::HEAD_AT_GPU: - case SyncedMemory::SYNCED: - #ifndef CPU_ONLY - diff = mutable_gpu_diff(); - caffe_gpu_scal(count_, scale_factor, diff); - return; + Dtype* diff; + if (!diff_) { + return; + } + switch (diff_->head()) { + case SyncedMemory::HEAD_AT_CPU: + diff = mutable_cpu_diff(); + caffe_scal(count_, scale_factor, diff); + return; + case SyncedMemory::HEAD_AT_GPU: + case SyncedMemory::SYNCED: +#ifndef CPU_ONLY + diff = mutable_gpu_diff(); + caffe_gpu_scal(count_, scale_factor, diff); + return; #else - NO_GPU; + NO_GPU; #endif - case SyncedMemory::UNINITIALIZED: - return; - default: - LOG(FATAL) << "Unknown 
SyncedMemory head state: " << diff_->head(); - } + case SyncedMemory::UNINITIALIZED: + return; + default: + LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head(); + } } template bool Blob::ShapeEquals(const BlobProto& other) { - if (other.has_num() || other.has_channels() || - other.has_height() || other.has_width()) { - // Using deprecated 4D Blob dimensions -- - // shape is (num, channels, height, width). - // Note: we do not use the normal Blob::num(), Blob::channels(), etc. - // methods as these index from the beginning of the blob shape, where legacy - // parameter blobs were indexed from the end of the blob shape (e.g., bias - // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). - return shape_.size() <= 4 && - LegacyShape(-4) == other.num() && - LegacyShape(-3) == other.channels() && - LegacyShape(-2) == other.height() && - LegacyShape(-1) == other.width(); - } - vector other_shape(other.shape().dim_size()); - for (int i = 0; i < other.shape().dim_size(); ++i) { - other_shape[i] = other.shape().dim(i); - } - return shape_ == other_shape; + if (other.has_num() || other.has_channels() || other.has_height() + || other.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). + // Note: we do not use the normal Blob::num(), Blob::channels(), etc. + // methods as these index from the beginning of the blob shape, where legacy + // parameter blobs were indexed from the end of the blob shape (e.g., bias + // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). 
+ return shape_.size() <= 4 && LegacyShape(-4) == other.num() + && LegacyShape(-3) == other.channels() + && LegacyShape(-2) == other.height() && LegacyShape(-1) == other.width(); + } + vector other_shape(other.shape().dim_size()); + for (int i = 0; i < other.shape().dim_size(); ++i) { + other_shape[i] = other.shape().dim(i); + } + return shape_ == other_shape; } template void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { - if (source.count() != count_ || source.shape() != shape_) { - if (reshape) { - ReshapeLike(source); - } else { - LOG(FATAL) << "Trying to copy blobs of different sizes."; - } - } - switch (Caffe::mode()) { - case Caffe::GPU: - if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), - static_cast(diff_->mutable_gpu_data())); - } else { - caffe_copy(count_, source.gpu_data(), - static_cast(data_->mutable_gpu_data())); - } - break; - case Caffe::CPU: - if (copy_diff) { - caffe_copy(count_, source.cpu_diff(), - static_cast(diff_->mutable_cpu_data())); - } else { - caffe_copy(count_, source.cpu_data(), - static_cast(data_->mutable_cpu_data())); - } - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } + if (source.count() != count_ || source.shape() != shape_) { + if (reshape) { + ReshapeLike(source); + } else { + LOG(FATAL) << "Trying to copy blobs of different sizes."; + } + } + switch (Caffe::mode()) { + case Caffe::GPU: + if (copy_diff) { + caffe_copy(count_, source.gpu_diff(), + static_cast(diff_->mutable_gpu_data())); + } else { + caffe_copy(count_, source.gpu_data(), + static_cast(data_->mutable_gpu_data())); + } + break; + case Caffe::CPU: + if (copy_diff) { + caffe_copy(count_, source.cpu_diff(), + static_cast(diff_->mutable_cpu_data())); + } else { + caffe_copy(count_, source.cpu_data(), + static_cast(data_->mutable_cpu_data())); + } + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } } template void Blob::FromProto(const BlobProto& proto, bool reshape) { - if (reshape) { - vector shape; - if 
(proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { - // Using deprecated 4D Blob dimensions -- - // shape is (num, channels, height, width). - shape.resize(4); - shape[0] = proto.num(); - shape[1] = proto.channels(); - shape[2] = proto.height(); - shape[3] = proto.width(); - } else { - shape.resize(proto.shape().dim_size()); - for (int i = 0; i < proto.shape().dim_size(); ++i) { - shape[i] = proto.shape().dim(i); - } - } - Reshape(shape); - } else { - CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)"; - } - // copy data - Dtype* data_vec = mutable_cpu_data(); - for (int i = 0; i < count_; ++i) { - data_vec[i] = proto.data(i); - } - if (proto.diff_size() > 0) { - Dtype* diff_vec = mutable_cpu_diff(); - for (int i = 0; i < count_; ++i) { - diff_vec[i] = proto.diff(i); - } - } + if (reshape) { + vector shape; + if (proto.has_num() || proto.has_channels() || proto.has_height() + || proto.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). 
+ shape.resize(4); + shape[0] = proto.num(); + shape[1] = proto.channels(); + shape[2] = proto.height(); + shape[3] = proto.width(); + } else { + shape.resize(proto.shape().dim_size()); + for (int i = 0; i < proto.shape().dim_size(); ++i) { + shape[i] = proto.shape().dim(i); + } + } + Reshape(shape); + } else { + CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)"; + } + // copy data + Dtype* data_vec = mutable_cpu_data(); + for (int i = 0; i < count_; ++i) { + data_vec[i] = proto.data(i); + } + if (proto.diff_size() > 0) { + Dtype* diff_vec = mutable_cpu_diff(); + for (int i = 0; i < count_; ++i) { + diff_vec[i] = proto.diff(i); + } + } } template void Blob::ToProto(BlobProto* proto, bool write_diff) const { - proto->clear_shape(); - for (int i = 0; i < shape_.size(); ++i) { - proto->mutable_shape()->add_dim(shape_[i]); - } - proto->clear_data(); - proto->clear_diff(); - const Dtype* data_vec = cpu_data(); - for (int i = 0; i < count_; ++i) { - proto->add_data(data_vec[i]); - } - if (write_diff) { - const Dtype* diff_vec = cpu_diff(); - for (int i = 0; i < count_; ++i) { - proto->add_diff(diff_vec[i]); - } - } + proto->clear_shape(); + for (int i = 0; i < shape_.size(); ++i) { + proto->mutable_shape()->add_dim(shape_[i]); + } + proto->clear_data(); + proto->clear_diff(); + const Dtype* data_vec = cpu_data(); + for (int i = 0; i < count_; ++i) { + proto->add_data(data_vec[i]); + } + if (write_diff) { + const Dtype* diff_vec = cpu_diff(); + for (int i = 0; i < count_; ++i) { + proto->add_diff(diff_vec[i]); + } + } } INSTANTIATE_CLASS (Blob); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 2157c96a..2698ffee 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -11,36 +11,36 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { - //To fix: for now we use fixed seed to get same result each time - /* - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), 
f) == sizeof(seed)) { - fclose(f); - return seed; - } - - LOG(INFO) << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - - pid = getpid(); - s = time(NULL); - seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); - //return seed; - LOG(WARNING) << "return fixed seed 37"; - */ - return 37; + //To fix: for now we use fixed seed to get same result each time + /* + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + + LOG(INFO) << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + + pid = getpid(); + s = time(NULL); + seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); + //return seed; + LOG(WARNING) << "return fixed seed 37"; + */ + return 37; } void GlobalInit(int* pargc, char*** pargv) { - // Google flags. - ::gflags::ParseCommandLineFlags(pargc, pargv, true); - // Google logging. - ::google::InitGoogleLogging(*(pargv)[0]); - // Provide a backtrace on segfault. - ::google::InstallFailureSignalHandler(); + // Google flags. + ::gflags::ParseCommandLineFlags(pargc, pargv, true); + // Google logging. + ::google::InitGoogleLogging(*(pargv)[0]); + // Provide a backtrace on segfault. + ::google::InstallFailureSignalHandler(); } #ifdef CPU_ONLY // CPU-only Caffe. 
@@ -53,25 +53,25 @@ Caffe::~Caffe() { } void Caffe::set_random_seed(const unsigned int seed) { - // RNG seed - Get().random_generator_.reset(new RNG(seed)); + // RNG seed + Get().random_generator_.reset(new RNG(seed)); } void Caffe::SetDevice(const int device_id) { - NO_GPU; + NO_GPU; } void Caffe::DeviceQuery() { - NO_GPU; + NO_GPU; } class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() {return rng_.get();} - private: - shared_ptr rng_; + public: + Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} + explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} + caffe::rng_t* rng() {return rng_.get();} + private: + shared_ptr rng_; }; Caffe::RNG::RNG() : generator_(new Generator()) {} @@ -79,79 +79,74 @@ Caffe::RNG::RNG() : generator_(new Generator()) {} Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_ = other.generator_; - return *this; + generator_ = other.generator_; + return *this; } void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); + return static_cast(generator_->rng()); } #else // Normal GPU + CPU Caffe. 
-Caffe::Caffe() -{ - amdDevice.Init(); - cl_int err = clblasSetup(); - if (err != CL_SUCCESS) { - LOG(ERROR) << "clBLAS setup failed " << err; - } +Caffe::Caffe() { + amdDevice.Init(); + cl_int err = clblasSetup(); + if (err != CL_SUCCESS) { + LOG(ERROR) << "clBLAS setup failed " << err; + } } Caffe::~Caffe() { - clblasTeardown(); + clblasTeardown(); } void Caffe::set_random_seed(const unsigned int seed) { - // RNG seed - Get().random_generator_.reset(new RNG(seed)); + // RNG seed + Get().random_generator_.reset(new RNG(seed)); } void Caffe::SetDevice(const int device_id) { - if (amdDevice.GetDevice() == device_id) { - return; - } - amdDevice.Init(device_id); + if (amdDevice.GetDevice() == device_id) { + return; + } + amdDevice.Init(device_id); } void Caffe::DeviceQuery() { - amdDevice.DeviceQuery(); + amdDevice.DeviceQuery(); } class Caffe::RNG::Generator { - public: - Generator() - : - rng_(new caffe::rng_t(cluster_seedgen())) { - } - explicit Generator(unsigned int seed) - : - rng_(new caffe::rng_t(seed)) { - } - caffe::rng_t* rng() { - return rng_.get(); - } - private: - shared_ptr rng_; + public: + Generator() + : rng_(new caffe::rng_t(cluster_seedgen())) { + } + explicit Generator(unsigned int seed) + : rng_(new caffe::rng_t(seed)) { + } + caffe::rng_t* rng() { + return rng_.get(); + } + private: + shared_ptr rng_; }; Caffe::RNG::RNG() -: - generator_(new Generator()) { + : generator_(new Generator()) { } Caffe::RNG::RNG(unsigned int seed) -: - generator_(new Generator(seed)) { + : generator_(new Generator(seed)) { } Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_.reset(other.generator_.get()); - return *this; + generator_.reset(other.generator_.get()); + return *this; } void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); + return static_cast(generator_->rng()); } #endif // CPU_ONLY diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index a041e126..1137bac3 100644 --- 
a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -12,520 +12,516 @@ namespace caffe { template DataTransformer::DataTransformer(const TransformationParameter& param, - Phase phase) -: - param_(param), phase_(phase) { - // check if we want to use mean_file - if (param_.has_mean_file()) { - CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; - const string& mean_file = param.mean_file(); - LOG(INFO) << "Loading mean file from: " << mean_file; - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - // check if we want to use mean_value - if (param_.mean_value_size() > 0) { - CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < param_.mean_value_size(); ++c) { - mean_values_.push_back(param_.mean_value(c)); - } - } + Phase phase) + : param_(param), phase_(phase) { + // check if we want to use mean_file + if (param_.has_mean_file()) { + CHECK_EQ(param_.mean_value_size(), 0) + << "Cannot specify mean_file and mean_value at the same time"; + const string& mean_file = param.mean_file(); + LOG(INFO) << "Loading mean file from: " << mean_file; + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + data_mean_.FromProto(blob_proto); + } + // check if we want to use mean_value + if (param_.mean_value_size() > 0) { + CHECK(param_.has_mean_file() == false) + << "Cannot specify mean_file and mean_value at the same time"; + for (int c = 0; c < param_.mean_value_size(); ++c) { + mean_values_.push_back(param_.mean_value(c)); + } + } } template void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { - const string& data = datum.data(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - const int crop_size = 
param_.crop_size(); - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_uint8 = data.size() > 0; - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(datum_channels, data_mean_.channels()); - CHECK_EQ(datum_height, data_mean_.height()); - CHECK_EQ(datum_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " - << datum_channels; - if (datum_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < datum_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int height = datum_height; - int width = datum_width; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - height = crop_size; - width = crop_size; - // We only do random crop when we do training. 
- if (phase_ == TRAIN) { - h_off = Rand(datum_height - crop_size + 1); - w_off = Rand(datum_width - crop_size + 1); - } else { - h_off = (datum_height - crop_size) / 2; - w_off = (datum_width - crop_size) / 2; - } - } - - Dtype datum_element; - int top_index, data_index; - for (int c = 0; c < datum_channels; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - if (has_uint8) { - datum_element = - static_cast(static_cast(data[data_index])); - } else { - datum_element = datum.float_data(data_index); - } - if (has_mean_file) { - transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = datum_element * scale; - } - } - } - } - } + Dtype* transformed_data) { + const string& data = datum.data(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + const int crop_size = param_.crop_size(); + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_uint8 = data.size() > 0; + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + + Dtype* mean = NULL; + if (has_mean_file) { + CHECK_EQ(datum_channels, data_mean_.channels()); + CHECK_EQ(datum_height, data_mean_.height()); + CHECK_EQ(datum_width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) + << "Specify either 1 
mean_value or as many as channels: " + << datum_channels; + if (datum_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < datum_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int height = datum_height; + int width = datum_width; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + height = crop_size; + width = crop_size; + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand(datum_height - crop_size + 1); + w_off = Rand(datum_width - crop_size + 1); + } else { + h_off = (datum_height - crop_size) / 2; + w_off = (datum_width - crop_size) / 2; + } + } + + Dtype datum_element; + int top_index, data_index; + for (int c = 0; c < datum_channels; ++c) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; + if (do_mirror) { + top_index = (c * height + h) * width + (width - 1 - w); + } else { + top_index = (c * height + h) * width + w; + } + if (has_uint8) { + datum_element = + static_cast(static_cast(data[data_index])); + } else { + datum_element = datum.float_data(data_index); + } + if (has_mean_file) { + transformed_data[top_index] = (datum_element - mean[data_index]) + * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = (datum_element - mean_values_[c]) + * scale; + } else { + transformed_data[top_index] = datum_element * scale; + } + } + } + } + } } template void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { - - // If datum is encoded, decoded and transform the cv::image. - if (datum.encoded()) { - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. 
- cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // Transform the cv::image into blob. - return Transform(cv_img, transformed_blob); - } else { - if (param_.force_color() || param_.force_gray()) { - LOG(ERROR) << "force_color and force_gray only for encoded datum"; - } - } - - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - // Check dimensions. - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, datum_channels); - CHECK_LE(height, datum_height); - CHECK_LE(width, datum_width); - CHECK_GE(num, 1); - - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - } else { - CHECK_EQ(datum_height, height); - CHECK_EQ(datum_width, width); - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - Transform(datum, transformed_data); + Blob* transformed_blob) { + + // If datum is encoded, decoded and transform the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Transform the cv::image into blob. 
+ return Transform(cv_img, transformed_blob); + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "force_color and force_gray only for encoded datum"; + } + } + + const int crop_size = param_.crop_size(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + // Check dimensions. + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, datum_channels); + CHECK_LE(height, datum_height); + CHECK_LE(width, datum_width); + CHECK_GE(num, 1); + + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + } else { + CHECK_EQ(datum_height, height); + CHECK_EQ(datum_width, width); + } + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + Transform(datum, transformed_data); } template void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { - const int datum_num = datum_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) - << - "The size of datum_vector must be no greater than transformed_blob->num()"; - Blob < Dtype > uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < datum_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(datum_vector[item_id], &uni_blob); - } + Blob* transformed_blob) { + const int datum_num = datum_vector.size(); + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int 
width = transformed_blob->width(); + + CHECK_GT(datum_num, 0) << "There is no datum to add"; + CHECK_LE(datum_num, num) + << "The size of datum_vector must be no greater than transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); + for (int item_id = 0; item_id < datum_num; ++item_id) { + int offset = transformed_blob->offset(item_id); + uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); + Transform(datum_vector[item_id], &uni_blob); + } } template void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { - const int mat_num = mat_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; - Blob < Dtype > uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < mat_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(mat_vector[item_id], &uni_blob); - } + Blob* transformed_blob) { + const int mat_num = mat_vector.size(); + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + + CHECK_GT(mat_num, 0) << "There is no MAT to add"; + CHECK_EQ(mat_num, num) + << "The size of mat_vector must be equals to transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); + for (int item_id = 0; item_id < mat_num; ++item_id) { + int offset = transformed_blob->offset(item_id); + uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); + Transform(mat_vector[item_id], &uni_blob); + } } template void DataTransformer::Transform(const 
cv::Mat& cv_img, - Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - - // Check dimensions. - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, img_channels); - CHECK_LE(height, img_height); - CHECK_LE(width, img_width); - CHECK_GE(num, 1); - - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(img_channels, data_mean_.channels()); - CHECK_EQ(img_height, data_mean_.height()); - CHECK_EQ(img_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; - if (img_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < img_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int h_off = 0; - int w_off = 0; - cv::Mat cv_cropped_img = cv_img; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. 
- if (phase_ == TRAIN) { - h_off = Rand(img_height - crop_size + 1); - w_off = Rand(img_width - crop_size + 1); - } else { - h_off = (img_height - crop_size) / 2; - w_off = (img_width - crop_size) / 2; - } - cv::Rect roi(w_off, h_off, crop_size, crop_size); - cv_cropped_img = cv_img(roi); - } else { - CHECK_EQ(img_height, height); - CHECK_EQ(img_width, width); - } - - CHECK(cv_cropped_img.data); - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - int top_index; - for (int h = 0; h < height; ++h) { - const uchar* ptr = cv_cropped_img.ptr < uchar > (h); - int img_index = 0; - for (int w = 0; w < width; ++w) { - for (int c = 0; c < img_channels; ++c) { - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (has_mean_file) { - int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; - transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = pixel * scale; - } - } - } - } - } + Blob* transformed_blob) { + const int crop_size = param_.crop_size(); + const int img_channels = cv_img.channels(); + const int img_height = cv_img.rows; + const int img_width = cv_img.cols; + + // Check dimensions. 
+ const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, img_channels); + CHECK_LE(height, img_height); + CHECK_LE(width, img_width); + CHECK_GE(num, 1); + + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(img_channels, 0); + CHECK_GE(img_height, crop_size); + CHECK_GE(img_width, crop_size); + + Dtype* mean = NULL; + if (has_mean_file) { + CHECK_EQ(img_channels, data_mean_.channels()); + CHECK_EQ(img_height, data_mean_.height()); + CHECK_EQ(img_width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) + << "Specify either 1 mean_value or as many as channels: " + << img_channels; + if (img_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < img_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int h_off = 0; + int w_off = 0; + cv::Mat cv_cropped_img = cv_img; + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + // We only do random crop when we do training. 
+ if (phase_ == TRAIN) { + h_off = Rand(img_height - crop_size + 1); + w_off = Rand(img_width - crop_size + 1); + } else { + h_off = (img_height - crop_size) / 2; + w_off = (img_width - crop_size) / 2; + } + cv::Rect roi(w_off, h_off, crop_size, crop_size); + cv_cropped_img = cv_img(roi); + } else { + CHECK_EQ(img_height, height); + CHECK_EQ(img_width, width); + } + + CHECK(cv_cropped_img.data); + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + int top_index; + for (int h = 0; h < height; ++h) { + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < width; ++w) { + for (int c = 0; c < img_channels; ++c) { + if (do_mirror) { + top_index = (c * height + h) * width + (width - 1 - w); + } else { + top_index = (c * height + h) * width + w; + } + // int top_index = (c * height + h) * width + w; + Dtype pixel = static_cast(ptr[img_index++]); + if (has_mean_file) { + int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; + transformed_data[top_index] = (pixel - mean[mean_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = (pixel - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = pixel * scale; + } + } + } + } + } } template void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int input_num = input_blob->num(); - const int input_channels = input_blob->channels(); - const int input_height = input_blob->height(); - const int input_width = input_blob->width(); - - if (transformed_blob->count() == 0) { - // Initialize transformed_blob with the right shape. 
- if (crop_size) { - transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); - } else { - transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); - } - } - - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int size = transformed_blob->count(); - - CHECK_LE(input_num, num); - CHECK_EQ(input_channels, channels); - CHECK_GE(input_height, height); - CHECK_GE(input_width, width); - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand(input_height - crop_size + 1); - w_off = Rand(input_width - crop_size + 1); - } else { - h_off = (input_height - crop_size) / 2; - w_off = (input_width - crop_size) / 2; - } - } else { - CHECK_EQ(input_height, height); - CHECK_EQ(input_width, width); - } - - Dtype* input_data = input_blob->mutable_cpu_data(); - if (has_mean_file) { - CHECK_EQ(input_channels, data_mean_.channels()); - CHECK_EQ(input_height, data_mean_.height()); - CHECK_EQ(input_width, data_mean_.width()); - for (int n = 0; n < input_num; ++n) { - int offset = input_blob->offset(n); - caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); - } - } - - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " - << input_channels; - if (mean_values_.size() == 1) { - caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); - } else { - for (int n = 0; n < input_num; ++n) { - for (int 
c = 0; c < input_channels; ++c) { - int offset = input_blob->offset(n, c); - caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); - } - } - } - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - - for (int n = 0; n < input_num; ++n) { - int top_index_n = n * channels; - int data_index_n = n * channels; - for (int c = 0; c < channels; ++c) { - int top_index_c = (top_index_n + c) * height; - int data_index_c = (data_index_n + c) * input_height + h_off; - for (int h = 0; h < height; ++h) { - int top_index_h = (top_index_c + h) * width; - int data_index_h = (data_index_c + h) * input_width + w_off; - if (do_mirror) { - int top_index_w = top_index_h + width - 1; - for (int w = 0; w < width; ++w) { - transformed_data[top_index_w - w] = input_data[data_index_h + w]; - } - } else { - for (int w = 0; w < width; ++w) { - transformed_data[top_index_h + w] = input_data[data_index_h + w]; - } - } - } - } - } - if (scale != Dtype(1)) { - DLOG(INFO) << "Scale: " << scale; - caffe_scal(size, scale, transformed_data); - } + Blob* transformed_blob) { + const int crop_size = param_.crop_size(); + const int input_num = input_blob->num(); + const int input_channels = input_blob->channels(); + const int input_height = input_blob->height(); + const int input_width = input_blob->width(); + + if (transformed_blob->count() == 0) { + // Initialize transformed_blob with the right shape. 
+ if (crop_size) { + transformed_blob->Reshape(input_num, input_channels, crop_size, + crop_size); + } else { + transformed_blob->Reshape(input_num, input_channels, input_height, + input_width); + } + } + + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int size = transformed_blob->count(); + + CHECK_LE(input_num, num); + CHECK_EQ(input_channels, channels); + CHECK_GE(input_height, height); + CHECK_GE(input_width, width); + + const Dtype scale = param_.scale(); + const bool do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand(input_height - crop_size + 1); + w_off = Rand(input_width - crop_size + 1); + } else { + h_off = (input_height - crop_size) / 2; + w_off = (input_width - crop_size) / 2; + } + } else { + CHECK_EQ(input_height, height); + CHECK_EQ(input_width, width); + } + + Dtype* input_data = input_blob->mutable_cpu_data(); + if (has_mean_file) { + CHECK_EQ(input_channels, data_mean_.channels()); + CHECK_EQ(input_height, data_mean_.height()); + CHECK_EQ(input_width, data_mean_.width()); + for (int n = 0; n < input_num; ++n) { + int offset = input_blob->offset(n); + caffe_sub(data_mean_.count(), input_data + offset, data_mean_.cpu_data(), + input_data + offset); + } + } + + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) + << "Specify either 1 mean_value or as many as channels: " + << input_channels; + if (mean_values_.size() == 1) { + caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); + } else { + for (int n = 0; n < input_num; ++n) { + for (int 
c = 0; c < input_channels; ++c) { + int offset = input_blob->offset(n, c); + caffe_add_scalar(input_height * input_width, -(mean_values_[c]), + input_data + offset); + } + } + } + } + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + + for (int n = 0; n < input_num; ++n) { + int top_index_n = n * channels; + int data_index_n = n * channels; + for (int c = 0; c < channels; ++c) { + int top_index_c = (top_index_n + c) * height; + int data_index_c = (data_index_n + c) * input_height + h_off; + for (int h = 0; h < height; ++h) { + int top_index_h = (top_index_c + h) * width; + int data_index_h = (data_index_c + h) * input_width + w_off; + if (do_mirror) { + int top_index_w = top_index_h + width - 1; + for (int w = 0; w < width; ++w) { + transformed_data[top_index_w - w] = input_data[data_index_h + w]; + } + } else { + for (int w = 0; w < width; ++w) { + transformed_data[top_index_h + w] = input_data[data_index_h + w]; + } + } + } + } + } + if (scale != Dtype(1)) { + DLOG(INFO) << "Scale: " << scale; + caffe_scal(size, scale, transformed_data); + } } template vector DataTransformer::InferBlobShape(const Datum& datum) { - if (datum.encoded()) { - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. - cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // InferBlobShape using the cv::image. - return InferBlobShape(cv_img); - } - - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - // Check dimensions. - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - // Build BlobShape. 
- vector shape(4); - shape[0] = 1; - shape[1] = datum_channels; - shape[2] = (crop_size) ? crop_size : datum_height; - shape[3] = (crop_size) ? crop_size : datum_width; - return shape; + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // InferBlobShape using the cv::image. + return InferBlobShape(cv_img); + } + + const int crop_size = param_.crop_size(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + // Check dimensions. + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = datum_channels; + shape[2] = (crop_size) ? crop_size : datum_height; + shape[3] = (crop_size) ? crop_size : datum_width; + return shape; } template vector DataTransformer::InferBlobShape( - const vector & datum_vector) { - const int num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to in the vector"; - // Use first datum in the vector to InferBlobShape. - vector shape = InferBlobShape(datum_vector[0]); - // Adjust num to the size of the vector. - shape[0] = num; - return shape; + const vector & datum_vector) { + const int num = datum_vector.size(); + CHECK_GT(num, 0) << "There is no datum to in the vector"; + // Use first datum in the vector to InferBlobShape. + vector shape = InferBlobShape(datum_vector[0]); + // Adjust num to the size of the vector. 
+ shape[0] = num; + return shape; } template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - // Check dimensions. - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - // Build BlobShape. - vector shape(4); - shape[0] = 1; - shape[1] = img_channels; - shape[2] = (crop_size) ? crop_size : img_height; - shape[3] = (crop_size) ? crop_size : img_width; - return shape; + const int crop_size = param_.crop_size(); + const int img_channels = cv_img.channels(); + const int img_height = cv_img.rows; + const int img_width = cv_img.cols; + // Check dimensions. + CHECK_GT(img_channels, 0); + CHECK_GE(img_height, crop_size); + CHECK_GE(img_width, crop_size); + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = img_channels; + shape[2] = (crop_size) ? crop_size : img_height; + shape[3] = (crop_size) ? crop_size : img_width; + return shape; } template vector DataTransformer::InferBlobShape( - const vector & mat_vector) { - const int num = mat_vector.size(); - CHECK_GT(num, 0) << "There is no cv_img to in the vector"; - // Use first cv_img in the vector to InferBlobShape. - vector shape = InferBlobShape(mat_vector[0]); - // Adjust num to the size of the vector. - shape[0] = num; - return shape; + const vector & mat_vector) { + const int num = mat_vector.size(); + CHECK_GT(num, 0) << "There is no cv_img to in the vector"; + // Use first cv_img in the vector to InferBlobShape. + vector shape = InferBlobShape(mat_vector[0]); + // Adjust num to the size of the vector. 
+ shape[0] = num; + return shape; } template void DataTransformer::InitRand() { - const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); - if (needs_rand) { - const unsigned int rng_seed = caffe_rng_rand(); - rng_.reset(new Caffe::RNG(rng_seed)); - } else { - rng_.reset(); - } + const bool needs_rand = param_.mirror() + || (phase_ == TRAIN && param_.crop_size()); + if (needs_rand) { + const unsigned int rng_seed = caffe_rng_rand(); + rng_.reset(new Caffe::RNG(rng_seed)); + } else { + rng_.reset(); + } } template int DataTransformer::Rand(int n) { - CHECK (rng_); - CHECK_GT(n, 0); - caffe::rng_t* rng = - static_cast(rng_->generator()); - return ((*rng)() % n); + CHECK (rng_); + CHECK_GT(n, 0); + caffe::rng_t* rng = static_cast(rng_->generator()); + return ((*rng)() % n); } INSTANTIATE_CLASS (DataTransformer); diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index 9e53a66a..bb8f9cb6 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -38,414 +38,386 @@ std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; Device::~Device() { - ReleaseKernels(); - free((void*) platformIDs); - free (DeviceIDs); - clReleaseProgram (Program); - clReleaseCommandQueue (CommandQueue); - clReleaseCommandQueue (CommandQueue_helper); - clReleaseContext (Context); - LOG(INFO) << "device destructor"; + ReleaseKernels(); + free((void*) platformIDs); + free (DeviceIDs); + clReleaseProgram (Program); + clReleaseCommandQueue (CommandQueue); + clReleaseCommandQueue (CommandQueue_helper); + clReleaseContext (Context); + LOG(INFO) << "device destructor"; } cl_int Device::Init(int deviceId) { - DisplayPlatformInfo(); - - clGetPlatformIDs(0, NULL, &numPlatforms); - cl_platform_id PlatformIDs[numPlatforms]; - clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); - - size_t nameLen; - cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, - platformName, &nameLen); - if (res != CL_SUCCESS) { - fprintf(stderr, "Err: Failed to Get 
Platform Info\n"); - return 0; - } - platformName[nameLen] = 0; - - GetDeviceInfo(); - cl_uint uiNumDevices; - cl_bool unified_memory = false; - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - uiNumDevices = numDevices; - if (0 == uiNumDevices) { - LOG(FATAL) << "Err: No GPU devices"; - } else { - pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id)); - OCL_CHECK( - clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, - pDevices, - &uiNumDevices)); - if (deviceId == -1) { - int i; - for (i = 0; i < (int) uiNumDevices; i++) { - clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, - sizeof(cl_bool), &unified_memory, NULL); - if (!unified_memory) { //skip iGPU - //we pick the first dGPU we found - pDevices[0] = pDevices[i]; - device_id = i; - LOG(INFO) << "Picked default device type : dGPU " << device_id; - break; - } - } - if (i == uiNumDevices) { - LOG(FATAL) << "Cannot find any dGPU! "; - } - } else if (deviceId >= 0 && deviceId < uiNumDevices) { - pDevices[0] = pDevices[deviceId]; - device_id = deviceId; - LOG(INFO) << "Picked device type : GPU " << device_id; - } else { - LOG(FATAL) << " Invalid GPU deviceId! 
"; - } - } - - Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); - if (NULL == Context) { - fprintf(stderr, "Err: Failed to Create Context\n"); - return 0; - } - CommandQueue = clCreateCommandQueue(Context, pDevices[0], - CL_QUEUE_PROFILING_ENABLE, NULL); - CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], - CL_QUEUE_PROFILING_ENABLE, NULL); - if (NULL == CommandQueue || NULL == CommandQueue_helper) { - fprintf(stderr, "Err: Failed to Create Commandqueue\n"); - return 0; - } - BuildProgram (oclKernelPath); - row = clblasRowMajor; - col = clblasColumnMajor; - return 0; + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return 0; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); + cl_uint uiNumDevices; + cl_bool unified_memory = false; + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if (0 == uiNumDevices) { + LOG(FATAL) << "Err: No GPU devices"; + } else { + pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK( + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, + pDevices, &uiNumDevices)); + if (deviceId == -1) { + int i; + for (i = 0; i < (int) uiNumDevices; i++) { + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &unified_memory, NULL); + if (!unified_memory) { //skip iGPU + //we pick the first dGPU we found + pDevices[0] = pDevices[i]; + device_id = i; + LOG(INFO) << "Picked default device type : dGPU " << device_id; + break; + } + } + if (i == uiNumDevices) { + LOG(FATAL) << "Cannot find any dGPU! 
"; + } + } else if (deviceId >= 0 && deviceId < uiNumDevices) { + pDevices[0] = pDevices[deviceId]; + device_id = deviceId; + LOG(INFO) << "Picked device type : GPU " << device_id; + } else { + LOG(FATAL) << " Invalid GPU deviceId! "; + } + } + + Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); + if (NULL == Context) { + fprintf(stderr, "Err: Failed to Create Context\n"); + return 0; + } + CommandQueue = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + if (NULL == CommandQueue || NULL == CommandQueue_helper) { + fprintf(stderr, "Err: Failed to Create Commandqueue\n"); + return 0; + } + BuildProgram (oclKernelPath); + row = clblasRowMajor; + col = clblasColumnMajor; + return 0; } -void Device::BuildProgram(std::string kernel_dir) - { - std::string strSource = ""; - DIR *ocl_dir; - struct dirent *dirp; - if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) - { - fprintf(stderr, "Err: Open ocl dir failed!\n"); - } - while ((dirp = readdir(ocl_dir)) != NULL) - { - //Ignore hidden files - if (dirp->d_name[0] == '.') - continue; - std::string file_name = std::string(dirp->d_name); - //Skip non *.cl files - size_t last_dot_pos = file_name.find_last_of("."); - if (file_name.substr(last_dot_pos + 1) != "cl") - continue; - - std::string ocl_kernel_full_path = kernel_dir + file_name; - std::string tmpSource = ""; - ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); - strSource += tmpSource; - } - const char *pSource; - pSource = strSource.c_str(); - size_t uiArrSourceSize[] = { 0 }; - uiArrSourceSize[0] = strlen(pSource); - Program = NULL; - Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, - NULL); - if (NULL == Program) { - fprintf(stderr, "Err: Failed to create program\n"); - } - cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), - NULL, NULL); - LOG(INFO) << "Build 
Program"; - if (CL_SUCCESS != iStatus) { - fprintf(stderr, "Err: Failed to build program\n"); - char szBuildLog[16384]; - clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, - sizeof(szBuildLog), szBuildLog, NULL); - std::cout << szBuildLog; - clReleaseProgram (Program); - } +void Device::BuildProgram(std::string kernel_dir) { + std::string strSource = ""; + DIR *ocl_dir; + struct dirent *dirp; + if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) { + fprintf(stderr, "Err: Open ocl dir failed!\n"); + } + while ((dirp = readdir(ocl_dir)) != NULL) { + //Ignore hidden files + if (dirp->d_name[0] == '.') + continue; + std::string file_name = std::string(dirp->d_name); + //Skip non *.cl files + size_t last_dot_pos = file_name.find_last_of("."); + if (file_name.substr(last_dot_pos + 1) != "cl") + continue; + + std::string ocl_kernel_full_path = kernel_dir + file_name; + std::string tmpSource = ""; + ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); + strSource += tmpSource; + } + const char *pSource; + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = { 0 }; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, + NULL); + if (NULL == Program) { + fprintf(stderr, "Err: Failed to create program\n"); + } + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), + NULL, NULL); + LOG(INFO) << "Build Program"; + if (CL_SUCCESS != iStatus) { + fprintf(stderr, "Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, + sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram (Program); + } } //Use to read OpenCL source code cl_int Device::ConvertToString(std::string pFileName, std::string &Str) { - size_t uiSize = 0; - size_t uiFileSize = 0; - char *pStr = NULL; - char *tmp = (char*) pFileName.data(); - std::fstream fFile(tmp, (std::fstream::in | 
std::fstream::binary)); - if (fFile.is_open()) { - fFile.seekg(0, std::fstream::end); - uiSize = uiFileSize = (size_t) fFile.tellg(); - fFile.seekg(0, std::fstream::beg); - pStr = new char[uiSize + 1]; - - if (NULL == pStr) { - fFile.close(); - return 0; - } - fFile.read(pStr, uiFileSize); - fFile.close(); - pStr[uiSize] = '\0'; - Str = pStr; - delete[] pStr; - return 0; - } - LOG(ERROR) << "Err: Failed to open cl file!"; - return -1; + size_t uiSize = 0; + size_t uiFileSize = 0; + char *pStr = NULL; + char *tmp = (char*) pFileName.data(); + std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary)); + if (fFile.is_open()) { + fFile.seekg(0, std::fstream::end); + uiSize = uiFileSize = (size_t) fFile.tellg(); + fFile.seekg(0, std::fstream::beg); + pStr = new char[uiSize + 1]; + + if (NULL == pStr) { + fFile.close(); + return 0; + } + fFile.read(pStr, uiFileSize); + fFile.close(); + pStr[uiSize] = '\0'; + Str = pStr; + delete[] pStr; + return 0; + } + LOG(ERROR) << "Err: Failed to open cl file!"; + return -1; } -cl_kernel Device::GetKernel(std::string kernel_name) - { - std::map::iterator it = Kernels.find(kernel_name); - if (it == Kernels.end()) - { - cl_int _err = 0; - cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); - OCL_CHECK(_err); - Kernels[kernel_name] = kernel; - } - return Kernels[kernel_name]; +cl_kernel Device::GetKernel(std::string kernel_name) { + std::map::iterator it = Kernels.find(kernel_name); + if (it == Kernels.end()) { + cl_int _err = 0; + cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); + OCL_CHECK(_err); + Kernels[kernel_name] = kernel; + } + return Kernels[kernel_name]; } -void Device::ReleaseKernels() -{ - std::map::iterator it; - for (it = Kernels.begin(); it != Kernels.end(); it++) - { - clReleaseKernel(it->second); - } +void Device::ReleaseKernels() { + std::map::iterator it; + for (it = Kernels.begin(); it != Kernels.end(); it++) { + clReleaseKernel(it->second); + } } void 
Device::DisplayPlatformInfo() { - cl_int err; - - err = clGetPlatformIDs(0, NULL, &numPlatforms); - if (err != CL_SUCCESS || numPlatforms <= 0) - { - LOG(ERROR) << "Failed to find any OpenCL platform."; - return; - } - - platformIDs = (cl_platform_id *) malloc( - sizeof(cl_platform_id) * numPlatforms); - err = clGetPlatformIDs(numPlatforms, platformIDs, NULL); - if (err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find any OpenCL platform."; - return; - } - - LOG(INFO) << "Number of platforms found:" << numPlatforms; - - //iterate through the list of platforms displaying platform information - for (cl_uint i = 0; i < numPlatforms; i++) { - DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); - DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); - DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); - DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, - "CL_PLATFORM_EXTENSIONS"); - } + cl_int err; + + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (err != CL_SUCCESS || numPlatforms <= 0) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + platformIDs = (cl_platform_id *) malloc( + sizeof(cl_platform_id) * numPlatforms); + err = clGetPlatformIDs(numPlatforms, platformIDs, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + LOG(INFO) << "Number of platforms found:" << numPlatforms; + + //iterate through the list of platforms displaying platform information + for (cl_uint i = 0; i < numPlatforms; i++) { + DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); + DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); + DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, + 
"CL_PLATFORM_EXTENSIONS"); + } } void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, - std::string str) { - cl_int err; - std::size_t paramValueSize; - - err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); - if (err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL platform:" << str; - return; - } - - char * info = (char *) alloca(sizeof(char) * paramValueSize); - err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); - if (err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL platform:" << str; - return; - } - - LOG(INFO) << "\t" << str << "\t" << info; + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + char * info = (char *) alloca(sizeof(char) * paramValueSize); + err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + LOG(INFO) << "\t" << str << "\t" << info; } void Device::GetDeviceInfo() { - cl_int err; - //by default, we select the first platform. can be extended for more platforms - //query GPU device for now - err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, - &numDevices); - // we allow program run if no GPU is found. Just return. No error reported. 
- if (numDevices < 1) - { - LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; - LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; - return; - } - - DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); - err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, - DeviceIDs, NULL); - if (err != CL_SUCCESS) - { - LOG(INFO) << "Failed to find any GPU devices."; - return; - } - - LOG(INFO) << "Number of devices found:" << numDevices; - for (cl_uint i = 0; i < numDevices; i++) { - LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; - DisplayDeviceInfo < cl_device_type - > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); - DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); - DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); - DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); - DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); - DisplayDeviceInfo < cl_bool - > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); - DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); - DisplayDeviceInfo < size_t - > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); - DisplayDeviceInfo < cl_uint - > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); - DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, - "Max work item sizes"); - DisplayDeviceInfo < cl_command_queue_properties - > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); - DisplayDeviceInfo < cl_device_exec_capabilities - > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); - DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); - 
DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); - DisplayDeviceInfo < cl_ulong - > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); - } + cl_int err; + //by default, we select the first platform. can be extended for more platforms + //query GPU device for now + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, + &numDevices); + // we allow program run if no GPU is found. Just return. No error reported. + if (numDevices < 1) { + LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; + LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; + return; + } + + DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, + DeviceIDs, NULL); + if (err != CL_SUCCESS) { + LOG(INFO) << "Failed to find any GPU devices."; + return; + } + + LOG(INFO) << "Number of devices found:" << numDevices; + for (cl_uint i = 0; i < numDevices; i++) { + LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; + DisplayDeviceInfo < cl_device_type + > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo < size_t + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item 
dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, + "Max work item sizes"); + DisplayDeviceInfo < cl_command_queue_properties + > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo < cl_device_exec_capabilities + > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + } } -void Device::DeviceQuery() -{ - DisplayPlatformInfo(); +void Device::DeviceQuery() { + DisplayPlatformInfo(); - clGetPlatformIDs(0, NULL, &numPlatforms); - cl_platform_id PlatformIDs[numPlatforms]; - clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); - size_t nameLen; - cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, - platformName, &nameLen); - if (res != CL_SUCCESS) { - fprintf(stderr, "Err: Failed to Get Platform Info\n"); - return; - } - platformName[nameLen] = 0; + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return; + } + platformName[nameLen] = 0; - GetDeviceInfo(); + GetDeviceInfo(); } template void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, - std::string str) { - cl_int err; - std::size_t paramValueSize; - - err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); - if (err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL device info:" << str; - return; - } - - std::string content; - T * info = (T *) alloca(sizeof(T) * paramValueSize); - err = 
clGetDeviceInfo(id, name, paramValueSize, info, NULL); - if (err != CL_SUCCESS) - { - LOG(ERROR) << "Failed to find OpenCL device info:" << str; - return; - } - - switch (name) { - case CL_DEVICE_TYPE: - { - std::string deviceType; - appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); - - appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); - - appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); - - appendBitfield < cl_device_type - > ( - *(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); - - LOG(INFO) << "\t " << str << ":\t" << deviceType; - } - break; - case CL_DEVICE_EXECUTION_CAPABILITIES: - { - std::string memType; - appendBitfield < cl_device_exec_capabilities - > ( - *(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); - - appendBitfield < cl_device_exec_capabilities - > ( - *(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); - - LOG(INFO) << "\t " << str << ":\t" << memType; - - } - break; - case CL_DEVICE_QUEUE_PROPERTIES: - { - std::string memType; - appendBitfield < cl_device_exec_capabilities - > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); - - appendBitfield < cl_device_exec_capabilities - > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); - - LOG(INFO) << "\t " << str << ":\t" << memType; - } - break; - default: - LOG(INFO) << "\t" << str << ":\t" << *info; - break; - } + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + std::string content; + T * 
info = (T *) alloca(sizeof(T) * paramValueSize); + err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + switch (name) { + case CL_DEVICE_TYPE: { + std::string deviceType; + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); + + LOG(INFO) << "\t " << str << ":\t" << deviceType; + } + break; + case CL_DEVICE_EXECUTION_CAPABILITIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + + } + break; + case CL_DEVICE_QUEUE_PROPERTIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + } + break; + default: + LOG(INFO) << "\t" << str << ":\t" << *info; + break; + } } template -void Device::appendBitfield(T info, T value, std::string name, std::string &str) - { - if (info & value) - { - if (str.length() > 0) - { - str.append(" | "); - } - str.append(name); - } +void 
Device::appendBitfield(T info, T value, std::string name, + std::string &str) { + if (info & value) { + if (str.length() > 0) { + str.append(" | "); + } + str.append(name); + } } } // namespace caffe diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index ba302ba8..fb512847 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -4,36 +4,36 @@ namespace caffe { InternalThread::~InternalThread() { - WaitForInternalThreadToExit(); + WaitForInternalThreadToExit(); } bool InternalThread::is_started() const { - return thread_.get() != NULL && thread_->joinable(); + return thread_.get() != NULL && thread_->joinable(); } bool InternalThread::StartInternalThread() { - if (!WaitForInternalThreadToExit()) { - return false; - } - try { - thread_.reset( - new boost::thread(&InternalThread::InternalThreadEntry, this)); - } catch (...) { - return false; - } - return true; + if (!WaitForInternalThreadToExit()) { + return false; + } + try { + thread_.reset( + new boost::thread(&InternalThread::InternalThreadEntry, this)); + } catch (...) { + return false; + } + return true; } /** Will not return until the internal thread has exited. */ bool InternalThread::WaitForInternalThreadToExit() { - if (is_started()) { - try { - thread_->join(); - } catch (...) { - return false; - } - } - return true; + if (is_started()) { + try { + thread_->join(); + } catch (...) { + return false; + } + } + return true; } } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index a720ee92..44233c98 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -18,24 +18,23 @@ namespace caffe { // Get convolution layer according to engine. 
template -shared_ptr > GetConvolutionLayer( - const LayerParameter& param) { - ConvolutionParameter_Engine engine = param.convolution_param().engine(); - if (engine == ConvolutionParameter_Engine_DEFAULT) { - engine = ConvolutionParameter_Engine_CAFFE; +shared_ptr > GetConvolutionLayer(const LayerParameter& param) { + ConvolutionParameter_Engine engine = param.convolution_param().engine(); + if (engine == ConvolutionParameter_Engine_DEFAULT) { + engine = ConvolutionParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = ConvolutionParameter_Engine_CUDNN; + engine = ConvolutionParameter_Engine_CUDNN; #endif - } - if (engine == ConvolutionParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new ConvolutionLayer(param)); + } + if (engine == ConvolutionParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new ConvolutionLayer(param)); #ifdef USE_CUDNN - } else if (engine == ConvolutionParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNConvolutionLayer(param)); + } else if (engine == ConvolutionParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNConvolutionLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); @@ -43,29 +42,29 @@ REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); // Get pooling layer according to engine. 
template shared_ptr > GetPoolingLayer(const LayerParameter& param) { - PoolingParameter_Engine engine = param.pooling_param().engine(); - if (engine == PoolingParameter_Engine_DEFAULT) { - engine = PoolingParameter_Engine_CAFFE; + PoolingParameter_Engine engine = param.pooling_param().engine(); + if (engine == PoolingParameter_Engine_DEFAULT) { + engine = PoolingParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = PoolingParameter_Engine_CUDNN; + engine = PoolingParameter_Engine_CUDNN; #endif - } - if (engine == PoolingParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new PoolingLayer(param)); + } + if (engine == PoolingParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new PoolingLayer(param)); #ifdef USE_CUDNN - } else if (engine == PoolingParameter_Engine_CUDNN) { - PoolingParameter p_param = param.pooling_param(); - if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || - param.top_size() > 1) { - LOG(INFO) << "CUDNN does not support padding or multiple tops. " - << "Using Caffe's own pooling layer."; - return shared_ptr >(new PoolingLayer(param)); - } - return shared_ptr >(new CuDNNPoolingLayer(param)); + } else if (engine == PoolingParameter_Engine_CUDNN) { + PoolingParameter p_param = param.pooling_param(); + if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || + param.top_size() > 1) { + LOG(INFO) << "CUDNN does not support padding or multiple tops. " + << "Using Caffe's own pooling layer."; + return shared_ptr >(new PoolingLayer(param)); + } + return shared_ptr >(new CuDNNPoolingLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); @@ -73,22 +72,22 @@ REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); // Get relu layer according to engine. 
template shared_ptr > GetReLULayer(const LayerParameter& param) { - ReLUParameter_Engine engine = param.relu_param().engine(); - if (engine == ReLUParameter_Engine_DEFAULT) { - engine = ReLUParameter_Engine_CAFFE; + ReLUParameter_Engine engine = param.relu_param().engine(); + if (engine == ReLUParameter_Engine_DEFAULT) { + engine = ReLUParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = ReLUParameter_Engine_CUDNN; + engine = ReLUParameter_Engine_CUDNN; #endif - } - if (engine == ReLUParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new ReLULayer(param)); + } + if (engine == ReLUParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new ReLULayer(param)); #ifdef USE_CUDNN - } else if (engine == ReLUParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNReLULayer(param)); + } else if (engine == ReLUParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNReLULayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(ReLU, GetReLULayer); @@ -96,22 +95,22 @@ REGISTER_LAYER_CREATOR(ReLU, GetReLULayer); // Get sigmoid layer according to engine. 
template shared_ptr > GetSigmoidLayer(const LayerParameter& param) { - SigmoidParameter_Engine engine = param.sigmoid_param().engine(); - if (engine == SigmoidParameter_Engine_DEFAULT) { - engine = SigmoidParameter_Engine_CAFFE; + SigmoidParameter_Engine engine = param.sigmoid_param().engine(); + if (engine == SigmoidParameter_Engine_DEFAULT) { + engine = SigmoidParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = SigmoidParameter_Engine_CUDNN; + engine = SigmoidParameter_Engine_CUDNN; #endif - } - if (engine == SigmoidParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new SigmoidLayer(param)); + } + if (engine == SigmoidParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new SigmoidLayer(param)); #ifdef USE_CUDNN - } else if (engine == SigmoidParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSigmoidLayer(param)); + } else if (engine == SigmoidParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNSigmoidLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); @@ -119,22 +118,22 @@ REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); // Get softmax layer according to engine. 
template shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { - SoftmaxParameter_Engine engine = param.softmax_param().engine(); - if (engine == SoftmaxParameter_Engine_DEFAULT) { - engine = SoftmaxParameter_Engine_CAFFE; + SoftmaxParameter_Engine engine = param.softmax_param().engine(); + if (engine == SoftmaxParameter_Engine_DEFAULT) { + engine = SoftmaxParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = SoftmaxParameter_Engine_CUDNN; + engine = SoftmaxParameter_Engine_CUDNN; #endif - } - if (engine == SoftmaxParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new SoftmaxLayer(param)); + } + if (engine == SoftmaxParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new SoftmaxLayer(param)); #ifdef USE_CUDNN - } else if (engine == SoftmaxParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSoftmaxLayer(param)); + } else if (engine == SoftmaxParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNSoftmaxLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer); @@ -142,22 +141,22 @@ REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer); // Get tanh layer according to engine. 
template shared_ptr > GetTanHLayer(const LayerParameter& param) { - TanHParameter_Engine engine = param.tanh_param().engine(); - if (engine == TanHParameter_Engine_DEFAULT) { - engine = TanHParameter_Engine_CAFFE; + TanHParameter_Engine engine = param.tanh_param().engine(); + if (engine == TanHParameter_Engine_DEFAULT) { + engine = TanHParameter_Engine_CAFFE; #ifdef USE_CUDNN - engine = TanHParameter_Engine_CUDNN; + engine = TanHParameter_Engine_CUDNN; #endif - } - if (engine == TanHParameter_Engine_CAFFE) { - return shared_ptr < Layer > (new TanHLayer(param)); + } + if (engine == TanHParameter_Engine_CAFFE) { + return shared_ptr < Layer > (new TanHLayer(param)); #ifdef USE_CUDNN - } else if (engine == TanHParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNTanHLayer(param)); + } else if (engine == TanHParameter_Engine_CUDNN) { + return shared_ptr >(new CuDNNTanHLayer(param)); #endif - } else { - LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; - } + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + } } REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); @@ -165,15 +164,15 @@ REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); #ifdef WITH_PYTHON_LAYER template shared_ptr > GetPythonLayer(const LayerParameter& param) { - Py_Initialize(); - try { - bp::object module = bp::import(param.python_param().module().c_str()); - bp::object layer = module.attr(param.python_param().layer().c_str())(param); - return bp::extract > >(layer)(); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } + Py_Initialize(); + try { + bp::object module = bp::import(param.python_param().module().c_str()); + bp::object layer = module.attr(param.python_param().layer().c_str())(param); + return bp::extract > >(layer)(); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } REGISTER_LAYER_CREATOR(Python, GetPythonLayer); diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 5dc99b75..945162af 100644 
--- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -8,53 +8,53 @@ namespace caffe { template void AbsValLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " + "allow in-place computation."; } template -void AbsValLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_cpu_data(); - caffe_abs(count, bottom[0]->cpu_data(), top_data); +void AbsValLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_cpu_data(); + caffe_abs(count, bottom[0]->cpu_data(), top_data); } template void AbsValLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_cpu_sign(count, bottom_data, bottom_diff); - caffe_mul(count, bottom_diff, top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->cpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_cpu_sign(count, bottom_data, bottom_diff); + caffe_mul(count, bottom_diff, top_diff, bottom_diff); + } } // begin: code written/modified by AMD template void AbsValLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = 
top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); + const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); } template void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sign(count, bottom_data, bottom_diff); + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index a26839d4..4cfc96f8 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -11,78 +11,76 @@ namespace caffe { template -void AccuracyLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - top_k_ = this->layer_param_.accuracy_param().top_k(); +void AccuracyLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + top_k_ = this->layer_param_.accuracy_param().top_k(); - has_ignore_label_ = - this->layer_param_.accuracy_param().has_ignore_label(); - if (has_ignore_label_) { - ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); - } + has_ignore_label_ = this->layer_param_.accuracy_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); + } } 
template -void AccuracyLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) - << "top_k must be less than or equal to the number of classes."; - label_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); - outer_num_ = bottom[0]->count(0, label_axis_); - inner_num_ = bottom[0]->count(label_axis_ + 1); - CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " - << "label count (number of labels) must be N*H*W, " - << "with integer values in {0, 1, ..., C-1}."; - vector top_shape(0); // Accuracy is a scalar; 0 axes. - top[0]->Reshape(top_shape); +void AccuracyLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) + << "top_k must be less than or equal to the number of classes."; + label_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.accuracy_param().axis()); + outer_num_ = bottom[0]->count(0, label_axis_); + inner_num_ = bottom[0]->count(label_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + vector top_shape(0); // Accuracy is a scalar; 0 axes. 
+ top[0]->Reshape(top_shape); } template void AccuracyLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype accuracy = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const int dim = bottom[0]->count() / outer_num_; - const int num_labels = bottom[0]->shape(label_axis_); - vector < Dtype > maxval(top_k_ + 1); - vector max_id(top_k_ + 1); - int count = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; ++j) { - const int label_value = - static_cast(bottom_label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - continue; - } - DCHECK_GE(label_value, 0); - DCHECK_LT(label_value, num_labels); - // Top-k accuracy - std::vector < std::pair > bottom_data_vector; - for (int k = 0; k < num_labels; ++k) { - bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); - } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); - // check if true label is in top k predictions - for (int k = 0; k < top_k_; k++) { - if (bottom_data_vector[k].second == label_value) { - ++accuracy; - break; - } - } - ++count; - } - } + const vector*>& top) { + Dtype accuracy = 0; + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const int dim = bottom[0]->count() / outer_num_; + const int num_labels = bottom[0]->shape(label_axis_); + vector < Dtype > maxval(top_k_ + 1); + vector max_id(top_k_ + 1); + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = static_cast(bottom_label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels); + // Top-k accuracy + std::vector < std::pair > bottom_data_vector; + for (int k = 
0; k < num_labels; ++k) { + bottom_data_vector.push_back( + std::make_pair(bottom_data[i * dim + k * inner_num_ + j], k)); + } + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); + // check if true label is in top k predictions + for (int k = 0; k < top_k_; k++) { + if (bottom_data_vector[k].second == label_value) { + ++accuracy; + break; + } + } + ++count; + } + } - // LOG(INFO) << "Accuracy: " << accuracy; - top[0]->mutable_cpu_data()[0] = accuracy / count; - // Accuracy layer should not be used as a loss function. + // LOG(INFO) << "Accuracy: " << accuracy; + top[0]->mutable_cpu_data()[0] = accuracy / count; + // Accuracy layer should not be used as a loss function. } INSTANTIATE_CLASS (AccuracyLayer); diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index 235e8371..7b37283d 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -10,51 +10,50 @@ namespace caffe { template void ArgMaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - out_max_val_ = this->layer_param_.argmax_param().out_max_val(); - top_k_ = this->layer_param_.argmax_param().top_k(); - CHECK_GE(top_k_, 1) << " top k must not be less than 1."; - CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) - << "top_k must be less than or equal to the number of classes."; + const vector*>& top) { + out_max_val_ = this->layer_param_.argmax_param().out_max_val(); + top_k_ = this->layer_param_.argmax_param().top_k(); + CHECK_GE(top_k_, 1) << " top k must not be less than 1."; + CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) + << "top_k must be less than or equal to the number of classes."; } template void ArgMaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - if (out_max_val_) { - // Produces max_ind and max_val - top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); - } else { - // Produces only max_ind - 
top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1); - } + const vector*>& top) { + if (out_max_val_) { + // Produces max_ind and max_val + top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); + } else { + // Produces only max_ind + top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1); + } } template void ArgMaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - for (int i = 0; i < num; ++i) { - std::vector < std::pair > bottom_data_vector; - for (int j = 0; j < dim; ++j) { - bottom_data_vector.push_back( - std::make_pair(bottom_data[i * dim + j], j)); - } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); - for (int j = 0; j < top_k_; ++j) { - top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; - } - if (out_max_val_) { - for (int j = 0; j < top_k_; ++j) { - top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first; - } - } - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + for (int i = 0; i < num; ++i) { + std::vector < std::pair > bottom_data_vector; + for (int j = 0; j < dim; ++j) { + bottom_data_vector.push_back(std::make_pair(bottom_data[i * dim + j], j)); + } + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); + for (int j = 0; j < top_k_; ++j) { + top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; + } + if (out_max_val_) { + for (int j = 0; j < top_k_; ++j) { + top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first; + } + } + } } INSTANTIATE_CLASS (ArgMaxLayer); diff --git a/src/caffe/layers/base_conv_layer.cpp 
b/src/caffe/layers/base_conv_layer.cpp index 149b1a21..ee0df02f 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -18,33 +18,31 @@ template cl_mem BaseConvolutionLayer::transMem = clCreat template void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) { - if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { - ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; - clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); - ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, - CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, - NULL, - NULL); - } - if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { - ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; - clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); - ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context, - CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, - NULL, - NULL); - } + if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { + ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); + ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, + NULL, NULL); + } + if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { + ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); + ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, + NULL, NULL); + } } template void BaseConvolutionLayer::ocl_setup() { - M_ = num_output_ / group_; - K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; - N_ = height_out_ * width_out_; + M_ = num_output_ / group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / 
group_; + N_ = height_out_ * width_out_; #ifdef use_packing_scheme - size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); - size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); - Alloc_public_tmp_mem(subtop_size, trans_size); + size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); + size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); + Alloc_public_tmp_mem(subtop_size, trans_size); #endif } @@ -54,428 +52,417 @@ BaseConvolutionLayer::~BaseConvolutionLayer() { template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - // Configure the kernel size, padding, stride, and inputs. - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - if (conv_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = conv_param.kernel_size(); - } else { - kernel_h_ = conv_param.kernel_h(); - kernel_w_ = conv_param.kernel_w(); - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter 
dimensions cannot be zero."; - if (!conv_param.has_pad_h()) { - pad_h_ = pad_w_ = conv_param.pad(); - } else { - pad_h_ = conv_param.pad_h(); - pad_w_ = conv_param.pad_w(); - } - if (!conv_param.has_stride_h()) { - stride_h_ = stride_w_ = conv_param.stride(); - } else { - stride_h_ = conv_param.stride_h(); - stride_w_ = conv_param.stride_w(); - } - // Special case: im2col is the identity for 1x1 convolution with stride 1 - // and no padding, so flag for skipping the buffer and transformation. - is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 - && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; - // Configure output channels and groups. - channels_ = bottom[0]->channels(); - num_output_ = this->layer_param_.convolution_param().num_output(); - CHECK_GT(num_output_, 0); - group_ = this->layer_param_.convolution_param().group(); - CHECK_EQ(channels_ % group_, 0); - CHECK_EQ(num_output_ % group_, 0) - << "Number of output should be multiples of group."; - if (reverse_dimensions()) { - conv_out_channels_ = channels_; - conv_in_channels_ = num_output_; - } else { - conv_out_channels_ = num_output_; - conv_in_channels_ = channels_; - } - - // Handle the parameters: weights and biases. - // - blobs_[0] holds the filter weights - // - blobs_[1] holds the biases (optional) - bias_term_ = this->layer_param_.convolution_param().bias_term(); - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - if (bias_term_) { - this->blobs_.resize(2); - } else { - this->blobs_.resize(1); - } - // Initialize and fill the weights: - // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0].reset(new Blob( - conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); - shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( - this->layer_param_.convolution_param().weight_filler())); - weight_filler->Fill(this->blobs_[0].get()); - // If necessary, initialize and fill the biases. 
- if (bias_term_) { - vector bias_shape(1, num_output_); - this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( - this->layer_param_.convolution_param().bias_filler())); - bias_filler->Fill(this->blobs_[1].get()); - } - } - // Propagate gradients to the parameters (as directed by backward pass). - this->param_propagate_down_.resize(this->blobs_.size(), true); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + // Configure the kernel size, padding, stride, and inputs. + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); + } else { + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); + } else { + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); + } + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = 
conv_param.stride(); + } else { + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); + } + // Special case: im2col is the identity for 1x1 convolution with stride 1 + // and no padding, so flag for skipping the buffer and transformation. + is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 && stride_h_ == 1 && stride_w_ == 1 + && pad_h_ == 0 && pad_w_ == 0; + // Configure output channels and groups. + channels_ = bottom[0]->channels(); + num_output_ = this->layer_param_.convolution_param().num_output(); + CHECK_GT(num_output_, 0); + group_ = this->layer_param_.convolution_param().group(); + CHECK_EQ(channels_ % group_, 0); + CHECK_EQ(num_output_ % group_, 0) + << "Number of output should be multiples of group."; + if (reverse_dimensions()) { + conv_out_channels_ = channels_; + conv_in_channels_ = num_output_; + } else { + conv_out_channels_ = num_output_; + conv_in_channels_ = channels_; + } + + // Handle the parameters: weights and biases. + // - blobs_[0] holds the filter weights + // - blobs_[1] holds the biases (optional) + bias_term_ = this->layer_param_.convolution_param().bias_term(); + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Initialize and fill the weights: + // output channels x input channels per-group x kernel height x kernel width + this->blobs_[0].reset( + new Blob(conv_out_channels_, conv_in_channels_ / group_, + kernel_h_, kernel_w_)); + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, initialize and fill the biases. 
+ if (bias_term_) { + vector bias_shape(1, num_output_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } + // Propagate gradients to the parameters (as directed by backward pass). + this->param_propagate_down_.resize(this->blobs_.size(), true); } template void BaseConvolutionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" - " convolution kernel."; - // TODO: generalize to handle inputs of different shapes. - for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { - CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; - CHECK_EQ(channels_, bottom[bottom_id]->channels()) - << "Inputs must have same channels."; - CHECK_EQ(height_, bottom[bottom_id]->height()) - << "Inputs must have same height."; - CHECK_EQ(width_, bottom[bottom_id]->width()) - << "Inputs must have same width."; - } - // Shape the tops. 
- compute_output_shape(); - for (int top_id = 0; top_id < top.size(); ++top_id) { - top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); - } - if (reverse_dimensions()) { - conv_in_height_ = height_out_; - conv_in_width_ = width_out_; - conv_out_spatial_dim_ = height_ * width_; - } else { - conv_in_height_ = height_; - conv_in_width_ = width_; - conv_out_spatial_dim_ = height_out_ * width_out_; - } - kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_; - weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_; - col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; - // The im2col result buffer will only hold one image at a time to avoid - // overly large memory usage. In the special case of 1x1 convolution - // it goes lazily unused to save memory. - if (reverse_dimensions()) { - col_buffer_.Reshape(1, kernel_dim_, height_, width_); - } else { - col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_); - } - // Set up the all ones "bias multiplier" for adding biases by BLAS - if (bias_term_) { - vector bias_multiplier_shape(1, height_out_ * width_out_); - bias_multiplier_.Reshape(bias_multiplier_shape); - caffe_set(bias_multiplier_.count(), Dtype(1), - bias_multiplier_.mutable_cpu_data()); - } - //initializa OpenCL kernels and cl_mem objects - ocl_setup(); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" + " convolution kernel."; + // TODO: generalize to handle inputs of different shapes. 
+ for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { + CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; + CHECK_EQ(channels_, bottom[bottom_id]->channels()) + << "Inputs must have same channels."; + CHECK_EQ(height_, bottom[bottom_id]->height()) + << "Inputs must have same height."; + CHECK_EQ(width_, bottom[bottom_id]->width()) + << "Inputs must have same width."; + } + // Shape the tops. + compute_output_shape(); + for (int top_id = 0; top_id < top.size(); ++top_id) { + top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); + } + if (reverse_dimensions()) { + conv_in_height_ = height_out_; + conv_in_width_ = width_out_; + conv_out_spatial_dim_ = height_ * width_; + } else { + conv_in_height_ = height_; + conv_in_width_ = width_; + conv_out_spatial_dim_ = height_out_ * width_out_; + } + kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_; + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_; + col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // The im2col result buffer will only hold one image at a time to avoid + // overly large memory usage. In the special case of 1x1 convolution + // it goes lazily unused to save memory. 
+ if (reverse_dimensions()) { + col_buffer_.Reshape(1, kernel_dim_, height_, width_); + } else { + col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_); + } + // Set up the all ones "bias multiplier" for adding biases by BLAS + if (bias_term_) { + vector bias_multiplier_shape(1, height_out_ * width_out_); + bias_multiplier_.Reshape(bias_multiplier_shape); + caffe_set(bias_multiplier_.count(), Dtype(1), + bias_multiplier_.mutable_cpu_data()); + } + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); } template void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); - } - col_buff = col_buffer_.cpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype) 0., output + output_offset_ * g); - } + const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); + } + col_buff = col_buffer_.cpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff + + col_offset_ * g, (Dtype) 0., output + output_offset_ * g); + } } template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, - const Dtype* bias) { - caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), - (Dtype) 1., output); + const Dtype* bias) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, 
num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), (Dtype) 1., output); } template void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_cpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype) 0., col_buff + col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_cpu(col_buff, input); - } + const Dtype* weights, Dtype* input) { + Dtype* col_buff = col_buffer_.mutable_cpu_data(); + if (is_1x1_) { + col_buff = input; + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ + / group_, (Dtype) 1., weights + weight_offset_ * g, output + + output_offset_ * g, (Dtype) 0., col_buff + col_offset_ * g); + } + if (!is_1x1_) { + conv_col2im_cpu(col_buff, input); + } } template void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); - col_buff = col_buffer_.cpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype) 1., weights + weight_offset_ * g); - } + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + if (!is_1x1_) { + conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); + col_buff = col_buffer_.cpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, conv_out_channels_ 
/ group_, kernel_dim_ + / group_, conv_out_spatial_dim_, (Dtype) 1., output + + output_offset_ * g, col_buff + col_offset_ * g, (Dtype) 1., weights + + weight_offset_ * g); + } } template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, - const Dtype* input) { - caffe_cpu_gemv < Dtype - > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.cpu_data(), 1., bias); + const Dtype* input) { + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.cpu_data(), 1., bias); } #ifndef CPU_ONLY template void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - } - col_buff = col_buffer_.gpu_data(); - } - - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, - conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ - / group_, - (Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g, - (Dtype) 0., output, top_offset_ + output_offset_ * g); - } + const Dtype* weights, Dtype* output, bool skip_im2col) { + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + } + col_buff = col_buffer_.gpu_data(); + } + + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ + / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_ + * g, col_buff, col_offset_ * g, (Dtype) 0., output, top_offset_ + + output_offset_ * g); + } } template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, - const Dtype* bias) { - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype) 1., 
bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype) 1., output, top_offset_); + const Dtype* bias) { + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_); } template void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_gpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ - / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype) 1., weights, weight_offset_ * g, - output, top_offset_ + output_offset_ * g, - (Dtype) 0., col_buff, col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_gpu(col_buff, input); - } + const Dtype* weights, Dtype* input) { + Dtype* col_buff = col_buffer_.mutable_gpu_data(); + if (is_1x1_) { + col_buff = input; + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_ + * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, col_offset_ + * g); + } + if (!is_1x1_) { + conv_col2im_gpu(col_buff, input); + } } - template void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - col_buff = col_buffer_.gpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ - / group_, kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype) 1., output, top_offset_, - (Dtype*) col_buff, col_offset_ * g, (Dtype) 1., - (Dtype*) 
weights, weight_offset_ * g); - } + const Dtype* output, Dtype* weights) { + const Dtype* col_buff = input; + if (!is_1x1_) { + conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); + col_buff = col_buffer_.gpu_data(); + } + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_, (Dtype*) col_buff, col_offset_ + * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g); + } } template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, - const Dtype* input) { - caffe_gpu_gemv < Dtype - > (CblasNoTrans, num_output_, N_, - (Dtype) 1., input, top_offset_, N_, - reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, - bias, (size_t) 0, 1); + const Dtype* input) { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1); } // begin: code written/modified by AMD template void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, - const Dtype* weight, Dtype* output, bool skip_im2col) { - cl_command_queue Queue; - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_gpu_opt(input); - } - col_buff = col_buffer_.gpu_data(); - } else { - caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, - (Dtype*) transMem); - } + const Dtype* weight, Dtype* output, bool skip_im2col) { + cl_command_queue Queue; + const Dtype* col_buff = input; + if (!is_1x1_) { + if (!skip_im2col) { + conv_im2col_gpu_opt(input); + } + col_buff = col_buffer_.gpu_data(); + } else { + caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, + (Dtype*) transMem); + } #ifdef multiQ - for (int g = 0; g < group_; ++g) { - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; - 
caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, - (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); - } - if(group_ == 2) { - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); + } + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #else - Queue = amdDevice.CommandQueue; - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, - (Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_ - * g, - (Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g); - } + Queue = amdDevice.CommandQueue; + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype) 1., weight, weight_offset_ + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 0., (Dtype*) subTopMem, top_offset_opt + * g); + } #endif - transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, - opt_num2); + transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, + opt_num2); } - template void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, - const Dtype* bias) { - for (int z = 0; z < opt_num2; z++) - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype) 1., bias, 0, - reinterpret_cast(bias_multiplier_.gpu_data()), 0, - (Dtype) 1., output, top_offset_ + num_output_ * N_ * z); + const Dtype* bias) { + for (int z = 0; z < opt_num2; z++) + caffe_gpu_gemm < Dtype + > (CblasNoTrans, 
CblasNoTrans, num_output_, N_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_ + + num_output_ * N_ * z); } - template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, - const Dtype* weights, Dtype* input) { - cl_command_queue Queue; - if (is_1x1_) { - caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); - } - for (int g = 0; g < group_; ++g) { + const Dtype* weights, Dtype* input) { + cl_command_queue Queue; + if (is_1x1_) { + caffe_gpu_memcpy( + height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, + (Dtype*) transMem); + } + for (int g = 0; g < group_; ++g) { #ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; #else - Queue = amdDevice.CommandQueue; + Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm < Dtype - > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, - (Dtype) 1., weights, weight_offset_ * g, - (Dtype*) subTopMem, top_offset_opt * g, - (Dtype) 0., (Dtype*) transMem, col_offset_ * g); - } + caffe_gpu_gemm < Dtype + > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype) 1., weights, weight_offset_ + * g, (Dtype*) subTopMem, top_offset_opt * g, (Dtype) 0., (Dtype*) transMem, col_offset_ + * g); + } #ifdef multiQ - if(group_ ==2) { - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + if(group_ ==2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #endif - if (!is_1x1_) { - conv_col2im_gpu_opt(input); - } else { - caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), - (Dtype*) transMem, input); - } + if (!is_1x1_) { + conv_col2im_gpu_opt(input); + } else { + caffe_gpu_memcpy( + height_ * width_ * conv_in_channels_ * opt_num2 * 
sizeof(Dtype), + (Dtype*) transMem, input); + } } template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, - const Dtype* output, Dtype* weights) { - cl_command_queue Queue; - if (!is_1x1_) { - conv_im2col_gpu_opt(input); - } else { - caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); - } - opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, - opt_num2); - - for (int g = 0; g < group_; ++g) { + const Dtype* output, Dtype* weights) { + cl_command_queue Queue; + if (!is_1x1_) { + conv_im2col_gpu_opt(input); + } else { + caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, + (Dtype*) transMem); + } + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + + for (int g = 0; g < group_; ++g) { #ifdef multiQ - if(g == 0) Queue = amdDevice.CommandQueue; - else Queue = amdDevice.CommandQueue_helper; + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; #else - Queue = amdDevice.CommandQueue; + Queue = amdDevice.CommandQueue; #endif - caffe_gpu_gemm < Dtype - > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, - (Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g, - (Dtype*) transMem, col_offset_ * g, (Dtype) 1., - (Dtype*) weights, weight_offset_ * g); + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, (Dtype) 1., (Dtype*) subTopMem, top_offset_opt + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ + * g); #ifdef multiQ - if(group_ == 2) { - clFinish(amdDevice.CommandQueue); - clFinish(amdDevice.CommandQueue_helper); - } + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } #endif - } + } } // end: code is written/modified by AMD diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index b0c0ebf2..d02e92c4 100644 --- 
a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -9,111 +9,107 @@ namespace caffe { template BaseDataLayer::BaseDataLayer(const LayerParameter& param) -: - Layer(param), - transform_param_(param.transform_param()) { + : Layer(param), transform_param_(param.transform_param()) { } template void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - if (top.size() == 1) { - output_labels_ = false; - } else { - output_labels_ = true; - } - data_transformer_.reset( - new DataTransformer(transform_param_, this->phase_)); - data_transformer_->InitRand(); - // The subclasses should setup the size of bottom and top - DataLayerSetUp(bottom, top); + const vector*>& top) { + if (top.size() == 1) { + output_labels_ = false; + } else { + output_labels_ = true; + } + data_transformer_.reset( + new DataTransformer(transform_param_, this->phase_)); + data_transformer_->InitRand(); + // The subclasses should setup the size of bottom and top + DataLayerSetUp(bottom, top); } template void BasePrefetchingDataLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); - // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make - // simultaneous cudaMalloc calls when the main thread is running. In some - // GPUs this seems to cause failures if we do not so. - this->prefetch_data_.mutable_cpu_data(); - if (this->output_labels_) { - this->prefetch_label_.mutable_cpu_data(); - } - DLOG(INFO) << "Initializing prefetch"; - this->CreatePrefetchThread(); - DLOG(INFO) << "Prefetch initialized."; + const vector*>& bottom, const vector*>& top) { + BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); + // Now, start the prefetch thread. 
Before calling prefetch, we make two + // cpu_data calls so that the prefetch thread does not accidentally make + // simultaneous cudaMalloc calls when the main thread is running. In some + // GPUs this seems to cause failures if we do not so. + this->prefetch_data_.mutable_cpu_data(); + if (this->output_labels_) { + this->prefetch_label_.mutable_cpu_data(); + } + DLOG(INFO) << "Initializing prefetch"; + this->CreatePrefetchThread(); + DLOG(INFO) << "Prefetch initialized."; } template void BasePrefetchingDataLayer::CreatePrefetchThread() { - this->data_transformer_->InitRand(); - CHECK(StartInternalThread()) << "Thread execution failed"; + this->data_transformer_->InitRand(); + CHECK(StartInternalThread()) << "Thread execution failed"; } template void BasePrefetchingDataLayer::JoinPrefetchThread() { - CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; + CHECK(WaitForInternalThreadToExit()) << "Thread joining failed"; } template void BasePrefetchingDataLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - - DLOG(INFO) << "Thread joined"; - // Reshape to loaded data. - top[0]->ReshapeLike(prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_cpu_data()); - DLOG(INFO) << "Prefetch copied"; - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_cpu_data()); - } - // Start a new prefetch thread - DLOG(INFO) << "CreatePrefetchThread"; - CreatePrefetchThread(); + const vector*>& bottom, const vector*>& top) { + // First, join the thread + JoinPrefetchThread(); + + DLOG(INFO) << "Thread joined"; + // Reshape to loaded data. 
+ top[0]->ReshapeLike(prefetch_data_); + // Copy the data + caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), + top[0]->mutable_cpu_data()); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. + top[1]->ReshapeLike(prefetch_label_); + // Copy the labels. + caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), + top[1]->mutable_cpu_data()); + } + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); } template void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, - const vector*>& top) { - - JoinPrefetchThread(); - DLOG(INFO) << "Thread joined"; - - top[0]->ReshapeLike(this->prefetch_data_); - OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, - sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, - NULL, NULL)); - DLOG(INFO) << "Prefetch copied"; - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, - sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), - 0, - NULL, NULL)); - } + const vector*>& bottom, const vector*>& top) { + + JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; + + top[0]->ReshapeLike(this->prefetch_data_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, + NULL, NULL)); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. 
+ top[1]->ReshapeLike(prefetch_label_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), + 0, NULL, NULL)); + } #ifdef Track_data_transfer #endif - // Start a new prefetch thread - DLOG(INFO) << "CreatePrefetchThread"; - CreatePrefetchThread(); + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index ad422a11..c2cce9e3 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -10,58 +10,56 @@ const float kBNLL_THRESHOLD = 50.; template void BNLLLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = - bottom_data[i] > 0 ? - bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = + bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. 
+ exp(bottom_data[i])); + } } template void BNLLLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype expval; - for (int i = 0; i < count; ++i) { - expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); - bottom_diff[i] = top_diff[i] * expval / (expval + 1.); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype expval; + for (int i = 0; i < count; ++i) { + expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); + bottom_diff[i] = top_diff[i] * expval / (expval + 1.); + } + } } // begin: code written/modified by AMD template void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLForward(count, bottom_data, top_data); } template void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // 
NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward(count, top_diff, bottom_data, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLBackward(count, top_diff, bottom_data, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 28aac6b2..5a351009 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -8,133 +8,135 @@ namespace caffe { template void ConcatLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const ConcatParameter& concat_param = this->layer_param_.concat_param(); - CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) - << "Either axis or concat_dim should be specified; not both."; + const vector*>& top) { + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) + << "Either axis or concat_dim should be specified; not both."; } template void ConcatLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int num_axes = bottom[0]->num_axes(); - const ConcatParameter& concat_param = this->layer_param_.concat_param(); - if (concat_param.has_concat_dim()) { - concat_axis_ = static_cast(concat_param.concat_dim()); - // Don't allow negative indexing for concat_dim, a uint32 -- almost - // certainly unintended. 
- CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " - << "produced negative result; concat_dim must satisfy " - << "0 <= concat_dim < " << kMaxBlobAxes; - CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; - } else { - concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); - } - // Initialize with the first blob. - vector top_shape = bottom[0]->shape(); - num_concats_ = bottom[0]->count(0, concat_axis_); - concat_input_size_ = bottom[0]->count(concat_axis_ + 1); - int bottom_count_sum = bottom[0]->count(); - for (int i = 1; i < bottom.size(); ++i) { - CHECK_EQ(num_axes, bottom[i]->num_axes()) - << "All inputs must have the same #axes."; - for (int j = 0; j < num_axes; ++j) { - if (j == concat_axis_) { - continue; - } - CHECK_EQ(top_shape[j], bottom[i]->shape(j)) - << "All inputs must have the same shape, except at concat_axis."; - } - bottom_count_sum += bottom[i]->count(); - top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); - } - top[0]->Reshape(top_shape); - CHECK_EQ(bottom_count_sum, top[0]->count()); + const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + if (concat_param.has_concat_dim()) { + concat_axis_ = static_cast(concat_param.concat_dim()); + // Don't allow negative indexing for concat_dim, a uint32 -- almost + // certainly unintended. + CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " + << "produced negative result; concat_dim must satisfy " + << "0 <= concat_dim < " << kMaxBlobAxes; + CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; + } else { + concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); + } + // Initialize with the first blob. 
+ vector top_shape = bottom[0]->shape(); + num_concats_ = bottom[0]->count(0, concat_axis_); + concat_input_size_ = bottom[0]->count(concat_axis_ + 1); + int bottom_count_sum = bottom[0]->count(); + for (int i = 1; i < bottom.size(); ++i) { + CHECK_EQ(num_axes, bottom[i]->num_axes()) + << "All inputs must have the same #axes."; + for (int j = 0; j < num_axes; ++j) { + if (j == concat_axis_) { + continue; + } + CHECK_EQ(top_shape[j], bottom[i]->shape(j)) + << "All inputs must have the same shape, except at concat_axis."; + } + bottom_count_sum += bottom[i]->count(); + top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); + } + top[0]->Reshape(top_shape); + CHECK_EQ(bottom_count_sum, top[0]->count()); } template void ConcatLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_cpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, - bottom_data + n * bottom_concat_axis * concat_input_size_, - top_data + (n * top_concat_axis + offset_concat_axis) - * concat_input_size_); - } - offset_concat_axis += bottom_concat_axis; - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_cpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, + bottom_data + n * bottom_concat_axis * concat_input_size_, + top_data + + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); + } + offset_concat_axis += 
bottom_concat_axis; + } } template void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { - continue; - } - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + - (n * top_concat_axis + offset_concat_axis) * concat_input_size_, - bottom_diff + n * bottom_concat_axis * concat_input_size_); - } - offset_concat_axis += bottom_concat_axis; - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + if (!propagate_down[i]) { + continue; + } + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, + top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + bottom_diff + n * bottom_concat_axis * concat_input_size_); + } + offset_concat_axis += bottom_concat_axis; + } } // begin: code written/modified by AMD template void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (bottom.size() == 1) { - return; - } - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = 
bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } + const vector*>& top) { + if (bottom.size() == 1) { + return; + } + Dtype* top_data = top[0]->mutable_gpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = true; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + offset_concat_axis += bottom_concat_axis; + } } template void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (bottom.size() == 1) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - if (propagate_down[i]) { - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - } - offset_concat_axis += bottom_concat_axis; - } + const vector& propagate_down, const vector*>& bottom) { + if (bottom.size() == 1) { + return; + } + const Dtype* top_diff = 
top[0]->gpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = false; + for (int i = 0; i < bottom.size(); ++i) { + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + if (propagate_down[i]) { + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + } + offset_concat_axis += bottom_concat_axis; + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index f6265726..a8e6f523 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -9,173 +9,151 @@ namespace caffe { template -void ContrastiveLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::LayerSetUp(bottom, top); - CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); - CHECK_EQ(bottom[0]->height(), 1); - CHECK_EQ(bottom[0]->width(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - CHECK_EQ(bottom[2]->channels(), 1); - CHECK_EQ(bottom[2]->height(), 1); - CHECK_EQ(bottom[2]->width(), 1); - diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); - diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); - dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1); - // vector of ones used to sum along channels - summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); - for (int i = 0; i < bottom[0]->channels(); ++i) - summer_vec_.mutable_cpu_data()[i] = Dtype(1); +void ContrastiveLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); + 
CHECK_EQ(bottom[0]->height(), 1); + CHECK_EQ(bottom[0]->width(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); + CHECK_EQ(bottom[2]->channels(), 1); + CHECK_EQ(bottom[2]->height(), 1); + CHECK_EQ(bottom[2]->width(), 1); + diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1); + // vector of ones used to sum along channels + summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); + for (int i = 0; i < bottom[0]->channels(); ++i) + summer_vec_.mutable_cpu_data()[i] = Dtype(1); } template void ContrastiveLossLayer::Forward_cpu( - const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), // a - bottom[1]->cpu_data(), // b - diff_.mutable_cpu_data()); // a_i-b_i - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, - diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist * dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& bottom, const vector*>& top) { + int count = bottom[0]->count(); + caffe_sub(count, bottom[0]->cpu_data(), // a + bottom[1]->cpu_data(), // b + diff_.mutable_cpu_data()); // a_i-b_i + const int channels = bottom[0]->channels(); + Dtype margin = 
this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, + diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void ContrastiveLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[i]->num()); - int num = bottom[i]->num(); - int channels = bottom[i]->channels(); - for (int j = 0; j < num; ++j) { - Dtype* bout = bottom[i]->mutable_cpu_diff(); - if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs - caffe_cpu_axpby( - channels, - alpha, - diff_.cpu_data() + (j * channels), - Dtype(0.0), - bout + (j * channels)); - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = margin - dist_sq_.cpu_data()[j]; - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq_.cpu_data()[j]); - mdist = margin - dist; - beta = -alpha * mdist / (dist + Dtype(1e-4)); - } - if (mdist > Dtype(0.0)) { - caffe_cpu_axpby( - channels, - beta, - diff_.cpu_data() + (j * channels), - Dtype(0.0), - bout + (j * channels)); - } else { - caffe_set(channels, Dtype(0), bout + (j * channels)); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[i]->num()); + int num = bottom[i]->num(); + int channels = bottom[i]->channels(); + for (int j = 0; j < num; ++j) { + Dtype* bout = bottom[i]->mutable_cpu_diff(); + if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs + caffe_cpu_axpby(channels, alpha, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = margin - dist_sq_.cpu_data()[j]; + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq_.cpu_data()[j]); + mdist = margin - dist; + beta = -alpha * mdist / (dist + Dtype(1e-4)); + } + if (mdist > Dtype(0.0)) { + caffe_cpu_axpby(channels, beta, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); + } else { + caffe_set(channels, Dtype(0), bout + (j * channels)); + } + } + } + } + } } // begin: code written/modified by AMD template void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - 
dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist * dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& bottom, const vector*>& top) { + const int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx(count, diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(), + Dtype(1.0), diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), Dtype(0.0), dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward(count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const int count = bottom[0]->count(); + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + const Dtype sign = (i == 0) ? 1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[0]->num()); + // NOLINT_NEXT_LINE(whitespace/operators) + CLLBackward(count, channels, margin, legacy_version, alpha, + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 0a989f69..9c250c42 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -9,230 +9,221 @@ namespace caffe { template void ConvolutionLayer::compute_output_shape() { - this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) - / this->stride_h_ + 1; - this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) - / this->stride_w_ + 1; + this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) + / this->stride_h_ + 1; + this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) + / this->stride_w_ + 1; } template void 
ConvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + top[i]->offset(n), bias); - } - } - } - - // CHECK_BLOB_DATA(top[0],20, "top[0]"); + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* top_data = top[i]->mutable_cpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); + } + } + } + + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. 
weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), + top_diff + top[i]->offset(n), weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); + } + } + } + } } // begin: code written/modified by AMD template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (use_packing_scheme && global_packing_N > 1) - Forward_gpu_opt2(bottom, top); - else - Forward_gpu_org(bottom, top); + const vector*>& top) { + if (use_packing_scheme && global_packing_N > 1) + Forward_gpu_opt2(bottom, top); + else + Forward_gpu_org(bottom, top); } template void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (use_packing_scheme && global_packing_N > 1) - Backward_gpu_opt2(top, propagate_down, bottom); - else - Backward_gpu_org(top, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + if (use_packing_scheme && global_packing_N > 1) + Backward_gpu_opt2(top, propagate_down, bottom); + else + Backward_gpu_org(top, propagate_down, bottom); } template void ConvolutionLayer::Forward_gpu_opt2( - const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); - - Dtype* top_data = top[i]->mutable_gpu_data(); - this->opt_num2 = global_packing_N; - this->weight_offset_ = this->M_ * this->K_; - for (int n = 0; n < this->num_; n += this->opt_num2) { - this->opt_num2 = - this->opt_num2 > (this->num_ - n) ? 
(this->num_ - n) : this->opt_num2; - //intermediate variables to pass offset - this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; - this->top_offset_ = top[i]->offset(n); - this->col_offset_ = this->K_ * this->N_ * this->opt_num2; - this->bottom_offset_ = bottom[i]->offset(n); - this->forward_gpu_gemm_opt(bottom_data, weight, - top_data); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias_opt(top_data, bias); - } - } - } - - //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - //CHECK_BLOB_DATA(top[0],20, "top[0]"); + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? 
(this->num_ - n) : this->opt_num2; + //intermediate variables to pass offset + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_ * this->opt_num2; + this->bottom_offset_ = bottom[i]->offset(n); + this->forward_gpu_gemm_opt(bottom_data, weight, top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias_opt(top_data, bias); + } + } + } + + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Forward_gpu_org( - const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); - - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - //two intermediate variables to pass offset - this->bottom_offset_ = bottom[i]->offset(n); - this->top_offset_ = top[i]->offset(n); - this->forward_gpu_gemm(bottom_data, weight, - top_data); - - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data, bias); - } - } - } - - // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); - //CHECK_BLOB_DATA(top[0],20, "top[0]"); + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + //two intermediate variables to pass offset + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->forward_gpu_gemm(bottom_data, weight, top_data); + + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + 
this->forward_gpu_bias(top_data, bias); + } + } + } + + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - this->weight_offset_ = this->M_ * this->K_; - this->opt_num2 = global_packing_N; - for (int n = 0; n < this->num_; n += this->opt_num2) { - this->opt_num2 = - this->opt_num2 > (this->num_ - n) ? - (this->num_ - n) : - this->opt_num2; - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); - this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm_opt(bottom_data, - top_diff, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->backward_gpu_gemm_opt(top_diff, weight, - bottom_diff); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + this->weight_offset_ = this->M_ * this->K_; + this->opt_num2 = global_packing_N; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? + (this->num_ - n) : this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->backward_gpu_gemm_opt(top_diff, weight, bottom_diff); + } + } + } + } } template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data, - top_diff, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff, weight, - bottom_diff); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm(top_diff, weight, bottom_diff); + } + } + } + } } // end: code written/modified by AMD diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index e9ee5221..fdae75a0 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -18,108 +18,108 @@ namespace caffe { template DataLayer::~DataLayer() { - this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } template void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - // Initialize DB - db_.reset(db::GetDB(this->layer_param_.data_param().backend())); - db_->Open(this->layer_param_.data_param().source(), db::READ); - cursor_.reset(db_->NewCursor()); + const vector*>& top) { + // Initialize DB + db_.reset(db::GetDB(this->layer_param_.data_param().backend())); + db_->Open(this->layer_param_.data_param().source(), db::READ); + cursor_.reset(db_->NewCursor()); - // Check if we should randomly skip a few data points - if (this->layer_param_.data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.data_param().rand_skip(); - LOG(INFO) << "Skipping first 
" << skip << " data points."; - while (skip-- > 0) { - cursor_->Next(); - } - } - // Read a data point, to initialize the prefetch and top blobs. - Datum datum; - datum.ParseFromString(cursor_->value()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape top[0] and prefetch_data according to the batch_size. - top_shape[0] = this->layer_param_.data_param().batch_size(); - this->prefetch_data_.Reshape(top_shape); - top[0]->ReshapeLike(this->prefetch_data_); - this->prefetch_data_.set_data_layer(); + // Check if we should randomly skip a few data points + if (this->layer_param_.data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() + % this->layer_param_.data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + while (skip-- > 0) { + cursor_->Next(); + } + } + // Read a data point, to initialize the prefetch and top blobs. + Datum datum; + datum.ParseFromString(cursor_->value()); + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape top[0] and prefetch_data according to the batch_size. 
+ top_shape[0] = this->layer_param_.data_param().batch_size(); + this->prefetch_data_.Reshape(top_shape); + top[0]->ReshapeLike(this->prefetch_data_); + this->prefetch_data_.set_data_layer(); - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - if (this->output_labels_) { - vector label_shape(1, this->layer_param_.data_param().batch_size()); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); - this->prefetch_label_.set_data_layer(); - } + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + if (this->output_labels_) { + vector label_shape(1, this->layer_param_.data_param().batch_size()); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); + this->prefetch_label_.set_data_layer(); + } } // This function is used to create a thread that prefetches the data. template void DataLayer::InternalThreadEntry() { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(this->prefetch_data_.count()); - CHECK(this->transformed_data_.count()); + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(this->prefetch_data_.count()); + CHECK(this->transformed_data_.count()); - // Reshape according to the first datum of each batch - // on single input batches allows for inputs of varying dimension. - const int batch_size = this->layer_param_.data_param().batch_size(); - Datum datum; - datum.ParseFromString(cursor_->value()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data according to the batch_size. 
- top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); + // Reshape according to the first datum of each batch + // on single input batches allows for inputs of varying dimension. + const int batch_size = this->layer_param_.data_param().batch_size(); + Datum datum; + datum.ParseFromString(cursor_->value()); + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. + top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); - Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* top_label = NULL; // suppress warnings about uninitialized variables + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* top_label = NULL; // suppress warnings about uninitialized variables - if (this->output_labels_) { - top_label = this->prefetch_label_.mutable_cpu_data(); - } - timer.Start(); - for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a datum - Datum datum; - datum.ParseFromString(cursor_->value()); - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply data transformations (mirror, scale, crop...) - int offset = this->prefetch_data_.offset(item_id); - this->transformed_data_.set_cpu_data(top_data + offset); - this->data_transformer_->Transform(datum, &(this->transformed_data_)); - // Copy label. - if (this->output_labels_) { - top_label[item_id] = datum.label(); - } - trans_time += timer.MicroSeconds(); - timer.Start(); - // go to the next item. 
- cursor_->Next(); - if (!cursor_->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor_->SeekToFirst(); - } - } - timer.Stop(); - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + if (this->output_labels_) { + top_label = this->prefetch_label_.mutable_cpu_data(); + } + timer.Start(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a datum + Datum datum; + datum.ParseFromString(cursor_->value()); + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply data transformations (mirror, scale, crop...) + int offset = this->prefetch_data_.offset(item_id); + this->transformed_data_.set_cpu_data(top_data + offset); + this->data_transformer_->Transform(datum, &(this->transformed_data_)); + // Copy label. + if (this->output_labels_) { + top_label[item_id] = datum.label(); + } + trans_time += timer.MicroSeconds(); + timer.Start(); + // go to the next item. 
+ cursor_->Next(); + if (!cursor_->valid()) { + DLOG(INFO) << "Restarting data prefetching from start."; + cursor_->SeekToFirst(); + } + } + timer.Stop(); + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } INSTANTIATE_CLASS (DataLayer); diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 402a787e..8ee81c9f 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -10,119 +10,119 @@ namespace caffe { template void DeconvolutionLayer::compute_output_shape() { - this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ - - 2 * this->pad_h_; - this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ - - 2 * this->pad_w_; + this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ + - 2 * this->pad_h_; + this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ + - 2 * this->pad_w_; } template void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + top[i]->offset(n), bias); - } - } - } + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* top_data = top[i]->mutable_cpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_gemm(bottom_data + 
bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); + } + } + } } template void DeconvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // Gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // Gradient w.r.t. bottom data, if necessary, reusing the column buffer - // we might have just computed above. - if (propagate_down[i]) { - this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n), - this->param_propagate_down_[0]); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + // Gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_cpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); + } + // Gradient w.r.t. bottom data, if necessary, reusing the column buffer + // we might have just computed above. + if (propagate_down[i]) { + this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n), + this->param_propagate_down_[0]); + } + } + } + } } template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->bottom_offset_ = bottom[i]->offset(n); - this->top_offset_ = top[i]->offset(n); - this->backward_gpu_gemm(bottom_data, weight, top_data); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data, bias); - } - } - } + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_gemm(bottom_data, weight, top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } } 
template void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + // Bias gradient, if necessary. 
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); + } + } + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 6692f238..f717fdbb 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -12,69 +12,67 @@ namespace caffe { template void DropoutLayer::ocl_setup(int bottom_count) { - MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - bottom_count * sizeof(int), NULL, NULL); + MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + bottom_count * sizeof(int), NULL, NULL); } template DropoutLayer::~DropoutLayer() { - OCL_CHECK (clReleaseMemObject(MaskMem) ); - }template + OCL_CHECK (clReleaseMemObject(MaskMem) );}template void DropoutLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.dropout_param().dropout_ratio(); - DCHECK(threshold_ > 0.); - DCHECK(threshold_ < 1.); - scale_ = 1. / (1. 
- threshold_); - uint_thres_ = static_cast(UINT_MAX * threshold_); - ocl_setup(bottom[0]->count()); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. - threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); + ocl_setup(bottom[0]->count()); } template void DropoutLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::Reshape(bottom, top); - // Set up the cache for random number generation - rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + const vector*>& top) { + NeuronLayer < Dtype > ::Reshape(bottom, top); + // Set up the cache for random number generation + rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); } template void DropoutLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - unsigned int* mask = rand_vec_.mutable_cpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - // Create random numbers - caffe_rng_bernoulli(count, 1. - threshold_, mask); - for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] * mask[i] * scale_; - } - } else { - caffe_copy(bottom[0]->count(), bottom_data, top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + unsigned int* mask = rand_vec_.mutable_cpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + // Create random numbers + caffe_rng_bernoulli(count, 1. 
- threshold_, mask); + for (int i = 0; i < count; ++i) { + top_data[i] = bottom_data[i] * mask[i] * scale_; + } + } else { + caffe_copy(bottom[0]->count(), bottom_data, top_data); + } } template void DropoutLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = rand_vec_.cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * mask[i] * scale_; - } - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = rand_vec_.cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * mask[i] * scale_; + } + } else { + caffe_copy(top[0]->count(), top_diff, bottom_diff); + } + } } #define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\ @@ -97,49 +95,48 @@ do{ \ // begin: code is written/modified by AMD template void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - //unsigned int* mask = - // static_cast(rand_vec_.mutable_gpu_data()); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + //unsigned int* mask = + // static_cast(rand_vec_.mutable_gpu_data()); #ifdef use_cpu_generator_dropout - unsigned int* mask_cpu = - 
static_cast(rand_vec_.mutable_cpu_data()); - caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu); - OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); - DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); + unsigned int* mask_cpu = + static_cast(rand_vec_.mutable_cpu_data()); + caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu); + OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); + DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); #else - caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1., - threshold_); - DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_, - top_data); + caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1., + threshold_); + DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_, + top_data); #endif - } else { - if(bottom_data != top_data) - caffe_gpu_copy(count, bottom_data, top_data); - } -CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); + } else { + if (bottom_data != top_data) + caffe_gpu_copy(count, bottom_data, top_data); + } + CHECK_GLOBAL_INT_MEM_DATA((int* )MaskMem, bottom[0]->count(), 20, "Mask"); } template void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const int count = bottom[0]->count(); - DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_, - (Dtype) scale_, bottom_diff); - } else { - caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); - } - CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); - CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff"); - } + const 
vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const int count = bottom[0]->count(); + DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_, + (Dtype) scale_, bottom_diff); + } else { + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + CHECK_GLOBAL_INT_MEM_DATA((int* )MaskMem, bottom[0]->count(), 20, "Mask"); + CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff"); + } } // end: code is written/modified by AMD #ifdef CPU_ONLY diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index a5225ea6..f13f3be1 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -8,105 +8,106 @@ namespace caffe { template void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int num_top = top.size(); - const DummyDataParameter& param = this->layer_param_.dummy_data_param(); - const int num_data_filler = param.data_filler_size(); - CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) - << "Number of data fillers must be 0, 1 or equal to the number of tops: " - << num_top << "; you specified " << num_data_filler << " data fillers."; + const vector*>& top) { + const int num_top = top.size(); + const DummyDataParameter& param = this->layer_param_.dummy_data_param(); + const int num_data_filler = param.data_filler_size(); + CHECK( + num_data_filler == 0 || num_data_filler == 1 + || num_data_filler == num_top) + << "Number of data fillers must be 0, 1 or equal to the number of tops: " + << num_top << "; you specified " << num_data_filler << " data fillers."; - const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); - if (legacy_dims) { - CHECK_EQ(0, param.shape_size()) - << "Both shape and 
legacy fields were specified"; - // Using deprecated 4D output dim specifiers. - CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; - CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; - CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; - CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; - } else { - CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; - } - // refill_[i] tells Forward i whether or not to actually refill top Blob i. - // If refill_[i] is false, Forward does nothing for Blob i. We use this to - // avoid wastefully refilling "constant" Blobs in every forward pass. - // We first fill refill_ in with the INVERSE of its final values. - // The first time we run Forward from the LayerSetUp method, we'll fill only - // Blobs for which refill_ is normally false. These Blobs will never be - // filled again. - refill_.clear(); - fillers_.clear(); - if (num_data_filler <= 1) { - FillerParameter filler_param; - if (num_data_filler == 0) { - filler_param.set_type("constant"); - filler_param.set_value(0); - } else { - filler_param.CopyFrom(param.data_filler(0)); - } - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. 
- refill_.resize(1); - refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); - fillers_.resize(1); - fillers_[0].reset(GetFiller < Dtype > (filler_param)); - } else { - refill_.resize(num_top); - fillers_.resize(num_top); - for (int i = 0; i < num_top; ++i) { - fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i))); - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. - refill_[i] = - (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); - } - } - for (int i = 0; i < num_top; ++i) { - if (legacy_dims) { - const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); - const int channels = - (param.channels_size() == 1) ? param.channels(0) : param.channels(i); - const int height = - (param.height_size() == 1) ? param.height(0) : param.height(i); - const int width = - (param.width_size() == 1) ? param.width(0) : param.width(i); - top[i]->Reshape(num, channels, height, width); - } else { - const int shape_index = (param.shape_size() == 1) ? 0 : i; - top[i]->Reshape(param.shape(shape_index)); - } - } - // Run Forward once, with refill_ inverted, to fill the constant Blobs. - this->Forward(bottom, top); - // Invert the inverted refill_ values to refill the desired (non-constant) - // Blobs in every usual forward pass. - for (int i = 0; i < refill_.size(); ++i) { - refill_[i] = !refill_[i]; - } + const bool legacy_dims = param.num_size() || param.channels_size() + || param.height_size() || param.width_size(); + if (legacy_dims) { + CHECK_EQ(0, param.shape_size()) + << "Both shape and legacy fields were specified"; + // Using deprecated 4D output dim specifiers. 
+ CHECK(param.num_size() == 1 || param.num_size() == num_top) + << "Must specify 'num' once, or once per top blob " << "(" << num_top + << "); specified " << param.num_size() << "."; + CHECK(param.channels_size() == 1 || param.channels_size() == num_top) + << "Must specify 'channels' once, or once per top blob " << "(" + << num_top << "); specified " << param.channels_size() << "."; + CHECK(param.height_size() == 1 || param.height_size() == num_top) + << "Must specify 'height' once, or once per top blob " << "(" << num_top + << "); specified " << param.height_size() << "."; + CHECK(param.width_size() == 1 || param.width_size() == num_top) + << "Must specify 'width' once, or once per top blob " << "(" << num_top + << "); specified " << param.width_size() << "."; + } else { + CHECK(param.shape_size() == 1 || param.shape_size() == num_top) + << "Must specify 'shape' once, or once per top blob " << "(" << num_top + << "); specified " << param.shape_size() << "."; + } + // refill_[i] tells Forward i whether or not to actually refill top Blob i. + // If refill_[i] is false, Forward does nothing for Blob i. We use this to + // avoid wastefully refilling "constant" Blobs in every forward pass. + // We first fill refill_ in with the INVERSE of its final values. + // The first time we run Forward from the LayerSetUp method, we'll fill only + // Blobs for which refill_ is normally false. These Blobs will never be + // filled again. + refill_.clear(); + fillers_.clear(); + if (num_data_filler <= 1) { + FillerParameter filler_param; + if (num_data_filler == 0) { + filler_param.set_type("constant"); + filler_param.set_value(0); + } else { + filler_param.CopyFrom(param.data_filler(0)); + } + // Refill on each iteration iff not using a constant filler, + // but use the inverse of this rule for the first run. 
+ refill_.resize(1); + refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); + fillers_.resize(1); + fillers_[0].reset(GetFiller < Dtype > (filler_param)); + } else { + refill_.resize(num_top); + fillers_.resize(num_top); + for (int i = 0; i < num_top; ++i) { + fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i))); + // Refill on each iteration iff not using a constant filler, + // but use the inverse of this rule for the first run. + refill_[i] = + (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); + } + } + for (int i = 0; i < num_top; ++i) { + if (legacy_dims) { + const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); + const int channels = + (param.channels_size() == 1) ? param.channels(0) : param.channels(i); + const int height = + (param.height_size() == 1) ? param.height(0) : param.height(i); + const int width = + (param.width_size() == 1) ? param.width(0) : param.width(i); + top[i]->Reshape(num, channels, height, width); + } else { + const int shape_index = (param.shape_size() == 1) ? 0 : i; + top[i]->Reshape(param.shape(shape_index)); + } + } + // Run Forward once, with refill_ inverted, to fill the constant Blobs. + this->Forward(bottom, top); + // Invert the inverted refill_ values to refill the desired (non-constant) + // Blobs in every usual forward pass. + for (int i = 0; i < refill_.size(); ++i) { + refill_[i] = !refill_[i]; + } } template void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - const int filler_id = (fillers_.size() > 1) ? i : 0; - if (refill_[filler_id]) { - fillers_[filler_id]->Fill(top[i]); - } - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + const int filler_id = (fillers_.size() > 1) ? 
i : 0; + if (refill_[filler_id]) { + fillers_[filler_id]->Fill(top[i]); + } + } } INSTANTIATE_CLASS (DummyDataLayer); diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index b904ad39..e2e5e1ab 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -9,236 +9,236 @@ namespace caffe { template void EltwiseLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << - "Eltwise Layer takes one coefficient per bottom blob."; - CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << - "Eltwise layer only takes coefficients for summation."; - op_ = this->layer_param_.eltwise_param().operation(); - // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector < Dtype > (bottom.size(), 1); - if (this->layer_param().eltwise_param().coeff_size()) { - for (int i = 0; i < bottom.size(); ++i) { - coeffs_[i] = this->layer_param().eltwise_param().coeff(i); - } - } - stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad(); + const vector*>& top) { + CHECK( + this->layer_param().eltwise_param().coeff_size() == 0 + || this->layer_param().eltwise_param().coeff_size() == bottom.size()) + << "Eltwise Layer takes one coefficient per bottom blob."; + CHECK( + !(this->layer_param().eltwise_param().operation() + == EltwiseParameter_EltwiseOp_PROD + && this->layer_param().eltwise_param().coeff_size())) + << "Eltwise layer only takes coefficients for summation."; + op_ = this->layer_param_.eltwise_param().operation(); + // Blob-wise coefficients for the elementwise operation. 
+ coeffs_ = vector < Dtype > (bottom.size(), 1); + if (this->layer_param().eltwise_param().coeff_size()) { + for (int i = 0; i < bottom.size(); ++i) { + coeffs_[i] = this->layer_param().eltwise_param().coeff(i); + } + } + stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad(); } template void EltwiseLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); - } - top[0]->ReshapeLike(*bottom[0]); - // If max operation, we will initialize the vector index part. - if (this->layer_param_.eltwise_param().operation() == - EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { - max_idx_.Reshape(bottom[0]->shape()); - } + const vector*>& top) { + for (int i = 1; i < bottom.size(); ++i) { + CHECK(bottom[i]->shape() == bottom[0]->shape()); + } + top[0]->ReshapeLike(*bottom[0]); + // If max operation, we will initialize the vector index part. + if (this->layer_param_.eltwise_param().operation() + == EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { + max_idx_.Reshape(bottom[0]->shape()); + } } template -void EltwiseLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - int* mask = NULL; - const Dtype* bottom_data_a = NULL; - const Dtype* bottom_data_b = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_cpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Dtype(0), top_data); - // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - // Initialize - mask = max_idx_.mutable_cpu_data(); - caffe_set(count, -1, mask); - caffe_set(count, Dtype(-FLT_MAX), top_data); - // bottom 0 & 1 - bottom_data_a = bottom[0]->cpu_data(); - bottom_data_b = bottom[1]->cpu_data(); - for (int idx = 0; idx < count; ++idx) { - if (bottom_data_a[idx] > bottom_data_b[idx]) { - top_data[idx] = bottom_data_a[idx]; // maxval - mask[idx] = 0; // maxid - } else { - top_data[idx] = bottom_data_b[idx]; // maxval - mask[idx] = 1; // maxid - } - } - // bottom 2++ - for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) { - bottom_data_b = bottom[blob_idx]->cpu_data(); - for (int idx = 0; idx < count; ++idx) { - if (bottom_data_b[idx] > top_data[idx]) { - top_data[idx] = bottom_data_b[idx]; // maxval - mask[idx] = blob_idx; // maxid - } - } - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } +void EltwiseLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + int* mask = NULL; + const Dtype* bottom_data_a = NULL; + const Dtype* bottom_data_b = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_cpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_set(count, Dtype(0), top_data); + // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + // Initialize + mask = max_idx_.mutable_cpu_data(); + caffe_set(count, -1, mask); + caffe_set(count, Dtype(-FLT_MAX), top_data); + // bottom 0 & 1 + bottom_data_a = bottom[0]->cpu_data(); + bottom_data_b = bottom[1]->cpu_data(); + for (int idx = 0; idx < count; ++idx) { + if (bottom_data_a[idx] > bottom_data_b[idx]) { + top_data[idx] = bottom_data_a[idx]; // maxval + mask[idx] = 0; // maxid + } else { + top_data[idx] = bottom_data_b[idx]; // maxval + mask[idx] = 1; // maxid + } + } + // bottom 2++ + for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) { + bottom_data_b = bottom[blob_idx]->cpu_data(); + for (int idx = 0; idx < count; ++idx) { + if (bottom_data_b[idx] > top_data[idx]) { + top_data[idx] = bottom_data_b[idx]; // maxval + mask[idx] = blob_idx; // maxid + } + } + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } } template void EltwiseLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { - continue; - } - if (!initialized) { - caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); - initialized = true; - } else { - caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_div(count, top_data, bottom_data, bottom_diff); - } - caffe_mul(count, bottom_diff, top_diff, bottom_diff); - 
break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.cpu_data(); - for (int index = 0; index < count; ++index) { - Dtype gradient = 0; - if (mask[index] == i) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } + const vector& propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); + initialized = true; + } else { + caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, bottom_diff); + } + } + } else { + caffe_div(count, top_data, bottom_data, bottom_diff); + } + caffe_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1)) { + caffe_copy(count, top_diff, bottom_diff); + } else { + caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.cpu_data(); + for (int index = 0; index < count; ++index) { + Dtype gradient = 0; + if (mask[index] == i) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } } // begin: code written/modified by AMD template 
void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, - top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, - mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } + const vector*>& top) { + int* mask = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.mutable_gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, + mask); + for (int i = 2; i < bottom.size(); ++i) { + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask); + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } } template void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { - continue; - } - if (!initialized) { - caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_gpu_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward(count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } + const vector& 
propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); + initialized = true; + } else { + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + } + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1.)) { + caffe_gpu_copy(count, top_diff, bottom_diff); + } else { + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.gpu_data(); + MaxBackward(count, top_diff, i, mask, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 9107f119..fce99953 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -8,76 +8,68 @@ namespace caffe { template -void EuclideanLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::Reshape(bottom, top); - CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) - << "Inputs must have the same dimension."; - diff_.ReshapeLike(*bottom[0]); +void EuclideanLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + 
CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) + << "Inputs must have the same dimension."; + diff_.ReshapeLike(*bottom[0]); } template void EuclideanLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); - Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& top) { + int count = bottom[0]->count(); + caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), + diff_.mutable_cpu_data()); + Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void EuclideanLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_cpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.cpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_cpu_diff()); // b - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_cpu_axpby(bottom[i]->count(), // count + alpha, // alpha + diff_.cpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // b + } + } } // begin: code written/modified by AMD template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; + const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + Dtype dot; + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; } template void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby(bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 087da677..25bcd0a0 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -9,90 +9,90 @@ namespace caffe { template void ExpLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - const Dtype base = this->layer_param_.exp_param().base(); - if (base != Dtype(-1)) { - CHECK_GT(base, 0) << "base must be strictly positive."; - } - // If base == -1, interpret the base as e and set log_base = 1 exactly. - // Otherwise, calculate its log explicitly. - const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; - const Dtype input_scale = this->layer_param_.exp_param().scale(); - const Dtype input_shift = this->layer_param_.exp_param().shift(); - inner_scale_ = log_base * input_scale; - outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + const Dtype base = this->layer_param_.exp_param().base(); + if (base != Dtype(-1)) { + CHECK_GT(base, 0) << "base must be strictly positive."; + } + // If base == -1, interpret the base as e and set log_base = 1 exactly. + // Otherwise, calculate its log explicitly. + const Dtype log_base = (base == Dtype(-1)) ? 
Dtype(1) : log(base); + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; + const Dtype input_scale = this->layer_param_.exp_param().scale(); + const Dtype input_shift = this->layer_param_.exp_param().shift(); + inner_scale_ = log_base * input_scale; + outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift); } template void ExpLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_exp(count, bottom_data, top_data); - } else { - caffe_cpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_scal(count, outer_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_exp(count, bottom_data, top_data); + } else { + caffe_cpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_scal(count, outer_scale_, top_data); + } } template void ExpLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_scal(count, inner_scale_, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + 
const int count = bottom[0]->count(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_scal(count, inner_scale_, bottom_diff); + } } // begin: code written/modified by AMD template void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); + } else { + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); + } } template void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* top_data = 
top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index 05dc2783..fc3ca142 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -9,172 +9,172 @@ namespace caffe { template void FilterLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(top.size(), bottom.size() - 1); - first_reshape_ = true; + const vector*>& top) { + CHECK_EQ(top.size(), bottom.size() - 1); + first_reshape_ = true; } template void FilterLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - // bottom[0...k-1] are the blobs to filter - // bottom[last] is the "selector_blob" - int selector_index = bottom.size() - 1; - for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) { - CHECK_EQ(bottom[selector_index]->shape(i), 1) - << "Selector blob dimensions must be singletons (1), except the first"; - } - for (int i = 0; i < bottom.size() - 1; ++i) { - CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) << - "Each bottom should have the same 0th dimension as the selector blob"; - } + const vector*>& top) { + // bottom[0...k-1] are the blobs to filter + // bottom[last] is the "selector_blob" + int selector_index = bottom.size() - 1; + for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) { + CHECK_EQ(bottom[selector_index]->shape(i), 1) + << "Selector blob dimensions must be singletons (1), except the first"; + } + for (int i = 0; i < bottom.size() - 1; ++i) { + CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) + << "Each bottom should have the same 0th dimension as the selector blob"; + } - const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); - 
indices_to_forward_.clear(); + const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); + indices_to_forward_.clear(); - // look for non-zero elements in bottom[0]. Items of each bottom that - // have the same index as the items in bottom[0] with value == non-zero - // will be forwarded - for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) { - // we don't need an offset because item size == 1 - const Dtype* tmp_data_selector = bottom_data_selector + item_id; - if (*tmp_data_selector) { - indices_to_forward_.push_back(item_id); - } - } - // only filtered items will be forwarded - int new_tops_num = indices_to_forward_.size(); - // init - if (first_reshape_) { - new_tops_num = bottom[0]->shape(0); - first_reshape_ = false; - } - for (int t = 0; t < top.size(); ++t) { - int num_axes = bottom[t]->num_axes(); - vector shape_top(num_axes); - shape_top[0] = new_tops_num; - for (int ts = 1; ts < num_axes; ++ts) - shape_top[ts] = bottom[t]->shape(ts); - top[t]->Reshape(shape_top); - } + // look for non-zero elements in bottom[0]. 
Items of each bottom that + // have the same index as the items in bottom[0] with value == non-zero + // will be forwarded + for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) { + // we don't need an offset because item size == 1 + const Dtype* tmp_data_selector = bottom_data_selector + item_id; + if (*tmp_data_selector) { + indices_to_forward_.push_back(item_id); + } + } + // only filtered items will be forwarded + int new_tops_num = indices_to_forward_.size(); + // init + if (first_reshape_) { + new_tops_num = bottom[0]->shape(0); + first_reshape_ = false; + } + for (int t = 0; t < top.size(); ++t) { + int num_axes = bottom[t]->num_axes(); + vector shape_top(num_axes); + shape_top[0] = new_tops_num; + for (int ts = 1; ts < num_axes; ++ts) + shape_top[ts] = bottom[t]->shape(ts); + top[t]->Reshape(shape_top); + } } template void FilterLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->cpu_data(); - Dtype* top_data = top[t]->mutable_cpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1); - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->cpu_data(); + Dtype* top_data = top[t]->mutable_cpu_data(); + int dim = bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * 
bottom[t]->count(1); + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } } template void FilterLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); i++) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); n++) { - data_offset_bottom = n * dim; - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - if (n != batch_offset) { // this data was not been forwarded - caffe_set(dim, Dtype(0), - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - next_to_backward_offset++; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top, - bottom[i]->mutable_cpu_diff() + data_offset_bottom); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; i < top.size(); i++) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { 
+ const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); n++) { + data_offset_bottom = n * dim; + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + caffe_set(dim, Dtype(0), + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + if (n != batch_offset) { // this data was not been forwarded + caffe_set(dim, Dtype(0), + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + next_to_backward_offset++; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top, + bottom[i]->mutable_cpu_diff() + data_offset_bottom); + } + } + } + } + } } // begin: code written/modified by AMD template void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->gpu_data(); + Dtype* top_data = top[t]->mutable_gpu_data(); + int dim = 
bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * dim; + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } } template void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; 
i < top.size(); ++i) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); ++n) { + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not been forwarded + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } + } + } + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index e79e9406..997f213d 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -8,34 +8,34 @@ namespace caffe { template void FlattenLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int start_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().axis()); - const int end_axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.flatten_param().end_axis()); - vector top_shape; - for (int i = 0; i < start_axis; ++i) { - top_shape.push_back(bottom[0]->shape(i)); - } - const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1); - 
top_shape.push_back(flattened_dim); - for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) { - top_shape.push_back(bottom[0]->shape(i)); - } - top[0]->Reshape(top_shape); - CHECK_EQ(top[0]->count(), bottom[0]->count()); + const vector*>& top) { + const int start_axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.flatten_param().axis()); + const int end_axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.flatten_param().end_axis()); + vector top_shape; + for (int i = 0; i < start_axis; ++i) { + top_shape.push_back(bottom[0]->shape(i)); + } + const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1); + top_shape.push_back(flattened_dim); + for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) { + top_shape.push_back(bottom[0]->shape(i)); + } + top[0]->Reshape(top_shape); + CHECK_EQ(top[0]->count(), bottom[0]->count()); } template void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - top[0]->ShareData(*bottom[0]); + const vector*>& top) { + top[0]->ShareData(*bottom[0]); } template void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - bottom[0]->ShareDiff(*top[0]); + const vector& propagate_down, const vector*>& bottom) { + bottom[0]->ShareDiff(*top[0]); } INSTANTIATE_CLASS (FlattenLayer); diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 6c6d8dec..2d7d405e 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -27,175 +27,174 @@ HDF5DataLayer::~HDF5DataLayer() { // Load data and label from HDF5 filename into the class property blobs. 
template void HDF5DataLayer::LoadHDF5FileData(const char* filename) { - DLOG(INFO) << "Loading HDF5 file: " << filename; - hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT); - if (file_id < 0) { - LOG(FATAL) << "Failed opening HDF5 file: " << filename; - } - - int top_size = this->layer_param_.top_size(); - hdf_blobs_.resize(top_size); - - const int MIN_DATA_DIM = 1; - const int MAX_DATA_DIM = INT_MAX; - - for (int i = 0; i < top_size; ++i) { - hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); - hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), - MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); - } - - herr_t status = H5Fclose(file_id); - CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename; - - // MinTopBlobs==1 guarantees at least one top blob - CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis."; - const int num = hdf_blobs_[0]->shape(0); - for (int i = 1; i < top_size; ++i) { - CHECK_EQ(hdf_blobs_[i]->shape(0), num); - } - // Default to identity permutation. - data_permutation_.clear(); - data_permutation_.resize(hdf_blobs_[0]->shape(0)); - for (int i = 0; i < hdf_blobs_[0]->shape(0); i++) - data_permutation_[i] = i; - - // Shuffle if needed. 
- if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) - << " rows (shuffled)"; - } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; - } + DLOG(INFO) << "Loading HDF5 file: " << filename; + hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + LOG(FATAL) << "Failed opening HDF5 file: " << filename; + } + + int top_size = this->layer_param_.top_size(); + hdf_blobs_.resize(top_size); + + const int MIN_DATA_DIM = 1; + const int MAX_DATA_DIM = INT_MAX; + + for (int i = 0; i < top_size; ++i) { + hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); + hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), + MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); + } + + herr_t status = H5Fclose(file_id); + CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename; + + // MinTopBlobs==1 guarantees at least one top blob + CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis."; + const int num = hdf_blobs_[0]->shape(0); + for (int i = 1; i < top_size; ++i) { + CHECK_EQ(hdf_blobs_[i]->shape(0), num); + } + // Default to identity permutation. + data_permutation_.clear(); + data_permutation_.resize(hdf_blobs_[0]->shape(0)); + for (int i = 0; i < hdf_blobs_[0]->shape(0); i++) + data_permutation_[i] = i; + + // Shuffle if needed. + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + << " rows (shuffled)"; + } else { + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; + } } template void HDF5DataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - // Refuse transformation parameters since HDF5 is totally generic. 
- CHECK(!this->layer_param_.has_transform_param()) << - this->type() << " does not transform data."; - // Read the source to parse the filenames. - const string& source = this->layer_param_.hdf5_data_param().source(); - LOG(INFO) << "Loading list of HDF5 filenames from: " << source; - hdf_filenames_.clear(); - std::ifstream source_file(source.c_str()); - if (source_file.is_open()) { - std::string line; - while (source_file >> line) { - hdf_filenames_.push_back(line); - } - } else { - LOG(FATAL) << "Failed to open source file: " << source; - } - source_file.close(); - num_files_ = hdf_filenames_.size(); - current_file_ = 0; - LOG(INFO) << "Number of HDF5 files: " << num_files_; - CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " - << source; - - file_permutation_.clear(); - file_permutation_.resize(num_files_); - // Default to identity permutation. - for (int i = 0; i < num_files_; i++) { - file_permutation_[i] = i; - } - - // Shuffle if needed. - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); - } - - // Load the first HDF5 file and initialize the line counter. - LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str()); - current_row_ = 0; - - // Reshape blobs. - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - const int top_size = this->layer_param_.top_size(); - vector top_shape; - for (int i = 0; i < top_size; ++i) { - top_shape.resize(hdf_blobs_[i]->num_axes()); - top_shape[0] = batch_size; - for (int j = 1; j < top_shape.size(); ++j) { - top_shape[j] = hdf_blobs_[i]->shape(j); - } - top[i]->Reshape(top_shape); - } + const vector*>& top) { + // Refuse transformation parameters since HDF5 is totally generic. + CHECK(!this->layer_param_.has_transform_param()) << this->type() + << " does not transform data."; + // Read the source to parse the filenames. 
+ const string& source = this->layer_param_.hdf5_data_param().source(); + LOG(INFO) << "Loading list of HDF5 filenames from: " << source; + hdf_filenames_.clear(); + std::ifstream source_file(source.c_str()); + if (source_file.is_open()) { + std::string line; + while (source_file >> line) { + hdf_filenames_.push_back(line); + } + } else { + LOG(FATAL) << "Failed to open source file: " << source; + } + source_file.close(); + num_files_ = hdf_filenames_.size(); + current_file_ = 0; + LOG(INFO) << "Number of HDF5 files: " << num_files_; + CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " + << source; + + file_permutation_.clear(); + file_permutation_.resize(num_files_); + // Default to identity permutation. + for (int i = 0; i < num_files_; i++) { + file_permutation_[i] = i; + } + + // Shuffle if needed. + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); + } + + // Load the first HDF5 file and initialize the line counter. + LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str()); + current_row_ = 0; + + // Reshape blobs. 
+ const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + const int top_size = this->layer_param_.top_size(); + vector top_shape; + for (int i = 0; i < top_size; ++i) { + top_shape.resize(hdf_blobs_[i]->num_axes()); + top_shape[0] = batch_size; + for (int j = 1; j < top_shape.size(); ++j) { + top_shape[j] = hdf_blobs_[i]->shape(j); + } + top[i]->Reshape(top_shape); + } } template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - ++current_file_; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); - } - } + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + ++current_file_; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + 
hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + caffe_copy(data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], + &top[j]->mutable_cpu_data()[i * data_dim]); + } + } } // begin: code written/modified by AMD template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - OCL_CHECK( - clEnqueueWriteBuffer(amdDevice.CommandQueue, - (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, - i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], - 0, NULL, NULL)); - //caffe_copy(data_dim, - // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); - } - } + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, 
++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + current_file_ += 1; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, + i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], 0, NULL, NULL)); + //caffe_copy(data_dim, + // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index a8c062bc..f9215a3d 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -13,94 +13,94 @@ namespace caffe { template void HDF5OutputLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - file_name_ = this->layer_param_.hdf5_output_param().file_name(); - file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - H5P_DEFAULT); - CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; - file_opened_ = true; + const vector*>& top) { + file_name_ = this->layer_param_.hdf5_output_param().file_name(); + file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, + H5P_DEFAULT); + CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; + 
file_opened_ = true; } template HDF5OutputLayer::~HDF5OutputLayer() { - if (file_opened_) { - herr_t status = H5Fclose(file_id_); - CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_; - } + if (file_opened_) { + herr_t status = H5Fclose(file_id_); + CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_; + } } template void HDF5OutputLayer::SaveBlobs() { - // TODO: no limit on the number of blobs - LOG(INFO) << "Saving HDF5 file " << file_name_; - CHECK_EQ(data_blob_.num(), label_blob_.num()) << - "data blob and label blob must have the same batch size"; - hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); - hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); - LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; + // TODO: no limit on the number of blobs + LOG(INFO) << "Saving HDF5 file " << file_name_; + CHECK_EQ(data_blob_.num(), label_blob_.num()) + << "data blob and label blob must have the same batch size"; + hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); + hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); + LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; } template void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + const vector*>& top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), 
bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); + for (int i = 0; i < bottom[0]->num(); ++i) { + caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], + &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], + &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + } + SaveBlobs(); } template void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; + const vector& propagate_down, const vector*>& bottom) { + return; } // begin: code written/modified by AMD template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + const vector*>& top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / 
bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - for (int i = 0; i < bottom[0]->num(); ++i) { - OCL_CHECK( - clEnqueueReadBuffer(amdDevice.CommandQueue, - (cl_mem) bottom[0]->gpu_data(), CL_TRUE, - i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, - &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); - OCL_CHECK( - clEnqueueReadBuffer(amdDevice.CommandQueue, - (cl_mem) bottom[1]->gpu_data(), CL_TRUE, - i * label_datum_dim * sizeof(Dtype), - sizeof(Dtype) * label_datum_dim, - &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, - NULL)); - } - SaveBlobs(); + for (int i = 0; i < bottom[0]->num(); ++i) { + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[0]->gpu_data(), CL_TRUE, + i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, + &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[1]->gpu_data(), CL_TRUE, + i * label_datum_dim * sizeof(Dtype), + sizeof(Dtype) * label_datum_dim, + &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, + NULL)); + } + SaveBlobs(); } template void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; + const vector& propagate_down, const vector*>& bottom) { + return; } #ifdef CPU_ONLY diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index d415bd64..b2259859 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -12,68 +12,68 @@ namespace caffe { template void HingeLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int count = bottom[0]->count(); - int dim = 
count / num; + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int count = bottom[0]->count(); + int dim = count / num; - caffe_copy(count, bottom_data, bottom_diff); - for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; - } - for (int i = 0; i < num; ++i) { - for (int j = 0; j < dim; ++j) { - bottom_diff[i * dim + j] = std::max( - Dtype(0), 1 + bottom_diff[i * dim + j]); - } - } - Dtype* loss = top[0]->mutable_cpu_data(); - switch (this->layer_param_.hinge_loss_param().norm()) { - case HingeLossParameter_Norm_L1: - loss[0] = caffe_cpu_asum(count, bottom_diff) / num; - break; - case HingeLossParameter_Norm_L2: - loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; - break; - default: - LOG(FATAL) << "Unknown Norm"; - } + caffe_copy(count, bottom_data, bottom_diff); + for (int i = 0; i < num; ++i) { + bottom_diff[i * dim + static_cast(label[i])] *= -1; + } + for (int i = 0; i < num; ++i) { + for (int j = 0; j < dim; ++j) { + bottom_diff[i * dim + j] = std::max(Dtype(0), + 1 + bottom_diff[i * dim + j]); + } + } + Dtype* loss = top[0]->mutable_cpu_data(); + switch (this->layer_param_.hinge_loss_param().norm()) { + case HingeLossParameter_Norm_L1: + loss[0] = caffe_cpu_asum(count, bottom_diff) / num; + break; + case HingeLossParameter_Norm_L2: + loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; + break; + default: + LOG(FATAL) << "Unknown Norm"; + } } template void HingeLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int count = 
bottom[0]->count(); - int dim = count / num; + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int count = bottom[0]->count(); + int dim = count / num; - for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] *= -1; - } + for (int i = 0; i < num; ++i) { + bottom_diff[i * dim + static_cast(label[i])] *= -1; + } - const Dtype loss_weight = top[0]->cpu_diff()[0]; - switch (this->layer_param_.hinge_loss_param().norm()) { - case HingeLossParameter_Norm_L1: - caffe_cpu_sign(count, bottom_diff, bottom_diff); - caffe_scal(count, loss_weight / num, bottom_diff); - break; - case HingeLossParameter_Norm_L2: - caffe_scal(count, loss_weight * 2 / num, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown Norm"; - } - } + const Dtype loss_weight = top[0]->cpu_diff()[0]; + switch (this->layer_param_.hinge_loss_param().norm()) { + case HingeLossParameter_Norm_L1: + caffe_cpu_sign(count, bottom_diff, bottom_diff); + caffe_scal(count, loss_weight / num, bottom_diff); + break; + case HingeLossParameter_Norm_L2: + caffe_scal(count, loss_weight * 2 / num, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown Norm"; + } + } } INSTANTIATE_CLASS (HingeLossLayer); diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index a8ddc7fe..886782b9 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -9,104 +9,106 @@ namespace caffe { template void Im2colLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "Filter size is 
kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - if (conv_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = conv_param.kernel_size(); - } else { - kernel_h_ = conv_param.kernel_h(); - kernel_w_ = conv_param.kernel_w(); - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - if (!conv_param.has_pad_h()) { - pad_h_ = pad_w_ = conv_param.pad(); - } else { - pad_h_ = conv_param.pad_h(); - pad_w_ = conv_param.pad_w(); - } - if (!conv_param.has_stride_h()) { - stride_h_ = stride_w_ = conv_param.stride(); - } else { - stride_h_ = conv_param.stride_h(); - stride_w_ = conv_param.stride_w(); - } + const vector*>& top) { + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && 
conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); + } else { + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); + } else { + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); + } + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = conv_param.stride(); + } else { + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); + } } template void Im2colLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - top[0]->Reshape( - bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, - (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, - (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + top[0]->Reshape(bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, + (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } template void Im2colLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), 
channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + top_data + top[0]->offset(n)); + } } template void Im2colLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + bottom_diff + bottom[0]->offset(n)); + } } template void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data, top[0]->offset(n)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, 
stride_h_, stride_w_, top_data, + top[0]->offset(n)); + } } template void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n)); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, bottom_diff, + bottom[0]->offset(n)); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 24ac8ffc..21957551 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -17,140 +17,141 @@ namespace caffe { template ImageDataLayer::~ImageDataLayer() { - this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } template void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int new_height = this->layer_param_.image_data_param().new_height(); - const int new_width = this->layer_param_.image_data_param().new_width(); - const bool is_color = this->layer_param_.image_data_param().is_color(); - string root_folder = this->layer_param_.image_data_param().root_folder(); - - CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; - // Read the file with filenames and labels - const string& source = this->layer_param_.image_data_param().source(); - LOG(INFO) << "Opening file " << source; - 
std::ifstream infile(source.c_str()); - string filename; - int label; - while (infile >> filename >> label) { - lines_.push_back(std::make_pair(filename, label)); - } - - if (this->layer_param_.image_data_param().shuffle()) { - // randomly shuffle data - LOG(INFO) << "Shuffling data"; - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - ShuffleImages(); - } - LOG(INFO) << "A total of " << lines_.size() << " images."; - - lines_id_ = 0; - // Check if we would need to randomly skip a few data points - if (this->layer_param_.image_data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); - LOG(INFO) << "Skipping first " << skip << " data points."; - CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; - lines_id_ = skip; - } - // Read an image, and use it to initialize the top blob. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - // Use data_transformer to infer the expected blob shape from a cv_image. - vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data and top[0] according to the batch_size. 
- const int batch_size = this->layer_param_.image_data_param().batch_size(); - top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); - top[0]->ReshapeLike(this->prefetch_data_); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); + const vector*>& top) { + const int new_height = this->layer_param_.image_data_param().new_height(); + const int new_width = this->layer_param_.image_data_param().new_width(); + const bool is_color = this->layer_param_.image_data_param().is_color(); + string root_folder = this->layer_param_.image_data_param().root_folder(); + + CHECK( + (new_height == 0 && new_width == 0) || (new_height > 0 && new_width > 0)) + << "Current implementation requires " + "new_height and new_width to be set at the same time."; + // Read the file with filenames and labels + const string& source = this->layer_param_.image_data_param().source(); + LOG(INFO) << "Opening file " << source; + std::ifstream infile(source.c_str()); + string filename; + int label; + while (infile >> filename >> label) { + lines_.push_back(std::make_pair(filename, label)); + } + + if (this->layer_param_.image_data_param().shuffle()) { + // randomly shuffle data + LOG(INFO) << "Shuffling data"; + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + ShuffleImages(); + } + LOG(INFO) << "A total of " << lines_.size() << " images."; + + lines_id_ = 0; + // Check if we would need to randomly skip a few data points + if (this->layer_param_.image_data_param().rand_skip()) { + unsigned int skip = caffe_rng_rand() + % this->layer_param_.image_data_param().rand_skip(); + LOG(INFO) << "Skipping first " << skip << " data points."; + CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; + lines_id_ = 
skip; + } + // Read an image, and use it to initialize the top blob. + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + // Use data_transformer to infer the expected blob shape from a cv_image. + vector top_shape = this->data_transformer_->InferBlobShape(cv_img); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data and top[0] according to the batch_size. + const int batch_size = this->layer_param_.image_data_param().batch_size(); + top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); + top[0]->ReshapeLike(this->prefetch_data_); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); } template void ImageDataLayer::ShuffleImages() { - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - shuffle(lines_.begin(), lines_.end(), prefetch_rng); + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + shuffle(lines_.begin(), lines_.end(), prefetch_rng); } // This function is used to create a thread that prefetches the data. 
template void ImageDataLayer::InternalThreadEntry() { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(this->prefetch_data_.count()); - CHECK(this->transformed_data_.count()); - ImageDataParameter image_data_param = this->layer_param_.image_data_param(); - const int batch_size = image_data_param.batch_size(); - const int new_height = image_data_param.new_height(); - const int new_width = image_data_param.new_width(); - const bool is_color = image_data_param.is_color(); - string root_folder = image_data_param.root_folder(); - - // Reshape according to the first image of each batch - // on single input batches allows for inputs of varying dimension. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - // Use data_transformer to infer the expected blob shape from a cv_img. - vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data according to the batch_size. - top_shape[0] = batch_size; - this->prefetch_data_.Reshape(top_shape); - - Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data(); - - // datum scales - const int lines_size = lines_.size(); - for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a blob - timer.Start(); - CHECK_GT(lines_size, lines_id_); - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply transformations (mirror, crop...) 
to the image - int offset = this->prefetch_data_.offset(item_id); - this->transformed_data_.set_cpu_data(prefetch_data + offset); - this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); - trans_time += timer.MicroSeconds(); - - prefetch_label[item_id] = lines_[lines_id_].second; - // go to the next iter - lines_id_++; - if (lines_id_ >= lines_size) { - // We have reached the end. Restart from the first. - DLOG(INFO) << "Restarting data prefetching from start."; - lines_id_ = 0; - if (this->layer_param_.image_data_param().shuffle()) { - ShuffleImages(); - } - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(this->prefetch_data_.count()); + CHECK(this->transformed_data_.count()); + ImageDataParameter image_data_param = this->layer_param_.image_data_param(); + const int batch_size = image_data_param.batch_size(); + const int new_height = image_data_param.new_height(); + const int new_width = image_data_param.new_width(); + const bool is_color = image_data_param.is_color(); + string root_folder = image_data_param.root_folder(); + + // Reshape according to the first image of each batch + // on single input batches allows for inputs of varying dimension. + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + // Use data_transformer to infer the expected blob shape from a cv_img. + vector top_shape = this->data_transformer_->InferBlobShape(cv_img); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. 
+ top_shape[0] = batch_size; + this->prefetch_data_.Reshape(top_shape); + + Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data(); + + // datum scales + const int lines_size = lines_.size(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a blob + timer.Start(); + CHECK_GT(lines_size, lines_id_); + cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, + new_height, new_width, is_color); + CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply transformations (mirror, crop...) to the image + int offset = this->prefetch_data_.offset(item_id); + this->transformed_data_.set_cpu_data(prefetch_data + offset); + this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); + trans_time += timer.MicroSeconds(); + + prefetch_label[item_id] = lines_[lines_id_].second; + // go to the next iter + lines_id_++; + if (lines_id_ >= lines_size) { + // We have reached the end. Restart from the first. 
+ DLOG(INFO) << "Restarting data prefetching from start."; + lines_id_ = 0; + if (this->layer_param_.image_data_param().shuffle()) { + ShuffleImages(); + } + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } INSTANTIATE_CLASS (ImageDataLayer); diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index 21414224..ffd2ab97 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -11,97 +11,96 @@ namespace caffe { template -void InfogainLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::LayerSetUp(bottom, top); - if (bottom.size() < 3) { - CHECK(this->layer_param_.infogain_loss_param().has_source()) - << "Infogain matrix source must be specified."; - BlobProto blob_proto; - ReadProtoFromBinaryFile( - this->layer_param_.infogain_loss_param().source(), &blob_proto); - infogain_.FromProto(blob_proto); - } +void InfogainLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + if (bottom.size() < 3) { + CHECK(this->layer_param_.infogain_loss_param().has_source()) + << "Infogain matrix source must be specified."; + BlobProto blob_proto; + ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(), + &blob_proto); + infogain_.FromProto(blob_proto); + } } template -void InfogainLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::Reshape(bottom, top); - Blob < Dtype > *infogain = NULL; - if (bottom.size() < 3) { - infogain = &infogain_; - } else { - infogain = bottom[2]; - } - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - const int num = bottom[0]->num(); - const int dim = 
bottom[0]->count() / num; - CHECK_EQ(infogain->num(), 1); - CHECK_EQ(infogain->channels(), 1); - CHECK_EQ(infogain->height(), dim); - CHECK_EQ(infogain->width(), dim); +void InfogainLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + Blob < Dtype > *infogain = NULL; + if (bottom.size() < 3) { + infogain = &infogain_; + } else { + infogain = bottom[2]; + } + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); + const int num = bottom[0]->num(); + const int dim = bottom[0]->count() / num; + CHECK_EQ(infogain->num(), 1); + CHECK_EQ(infogain->channels(), 1); + CHECK_EQ(infogain->height(), dim); + CHECK_EQ(infogain->width(), dim); } template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = NULL; - if (bottom.size() < 3) { - infogain_mat = infogain_.cpu_data(); - } else { - infogain_mat = bottom[2]->cpu_data(); - } - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - loss -= infogain_mat[label * dim + j] * log(prob); - } - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* infogain_mat = NULL; + if (bottom.size() < 3) { + infogain_mat = infogain_.cpu_data(); + } else { + infogain_mat = bottom[2]->cpu_data(); + } + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + for (int j = 0; j < dim; ++j) 
{ + Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); + loss -= infogain_mat[label * dim + j] * log(prob); + } + } + top[0]->mutable_cpu_data()[0] = loss / num; } template void InfogainLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down.size() > 2 && propagate_down[2]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to infogain inputs."; - } - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = NULL; - if (bottom.size() < 3) { - infogain_mat = infogain_.cpu_data(); - } else { - infogain_mat = bottom[2]->cpu_data(); - } - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = -top[0]->cpu_diff()[0] / num; - for (int i = 0; i < num; ++i) { - const int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; - } - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down.size() > 2 && propagate_down[2]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to infogain inputs."; + } + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* infogain_mat = NULL; + if (bottom.size() < 3) { + infogain_mat = infogain_.cpu_data(); + } else { + infogain_mat = bottom[2]->cpu_data(); + } + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + int num = 
bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + const Dtype scale = -top[0]->cpu_diff()[0] / num; + for (int i = 0; i < num; ++i) { + const int label = static_cast(bottom_label[i]); + for (int j = 0; j < dim; ++j) { + Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); + bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; + } + } + } } INSTANTIATE_CLASS (InfogainLossLayer); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 3beca42f..b9ae3370 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -11,159 +11,150 @@ namespace caffe { template void InnerProductLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int num_output = this->layer_param_.inner_product_param().num_output(); - bias_term_ = this->layer_param_.inner_product_param().bias_term(); - N_ = num_output; - const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); - // Dimensions starting from "axis" are "flattened" into a single - // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), - // and axis == 1, N inner products with dimension CHW are performed. 
- K_ = bottom[0]->count(axis); - // Check if we need to set up the weights - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - if (bias_term_) { - this->blobs_.resize(2); - } else { - this->blobs_.resize(1); - } - // Intialize the weight - vector weight_shape(2); - weight_shape[0] = N_; - weight_shape[1] = K_; - this->blobs_[0].reset(new Blob(weight_shape)); - // fill the weights - shared_ptr < Filler > weight_filler(GetFiller < Dtype > ( - this->layer_param_.inner_product_param().weight_filler())); - weight_filler->Fill(this->blobs_[0].get()); - // If necessary, intiialize and fill the bias term - if (bias_term_) { - vector bias_shape(1, N_); - this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr < Filler > bias_filler(GetFiller < Dtype > ( - this->layer_param_.inner_product_param().bias_filler())); - bias_filler->Fill(this->blobs_[1].get()); - } - } // parameter initialization - this->param_propagate_down_.resize(this->blobs_.size(), true); + const vector*>& top) { + const int num_output = this->layer_param_.inner_product_param().num_output(); + bias_term_ = this->layer_param_.inner_product_param().bias_term(); + N_ = num_output; + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + // Dimensions starting from "axis" are "flattened" into a single + // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), + // and axis == 1, N inner products with dimension CHW are performed. 
+ K_ = bottom[0]->count(axis); + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Intialize the weight + vector weight_shape(2); + weight_shape[0] = N_; + weight_shape[1] = K_; + this->blobs_[0].reset(new Blob(weight_shape)); + // fill the weights + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, intiialize and fill the bias term + if (bias_term_) { + vector bias_shape(1, N_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } // parameter initialization + this->param_propagate_down_.resize(this->blobs_.size(), true); } template void InnerProductLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - // Figure out the dimensions - const int axis = bottom[0]->CanonicalAxisIndex( - this->layer_param_.inner_product_param().axis()); - const int new_K = bottom[0]->count(axis); - CHECK_EQ(K_, new_K) - << "Input size incompatible with inner product parameters."; - // The first "axis" dimensions are independent inner products; the total - // number of these is M_, the product over these dimensions. - M_ = bottom[0]->count(0, axis); - // The top shape will be the bottom shape with the flattened axes dropped, - // and replaced by a single axis with dimension num_output (N_). 
- vector top_shape = bottom[0]->shape(); - top_shape.resize(axis + 1); - top_shape[axis] = N_; - top[0]->Reshape(top_shape); - // Set up the bias multiplier - if (bias_term_) { - vector bias_shape(1, M_); - bias_multiplier_.Reshape(bias_shape); - caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); - } + const vector*>& top) { + // Figure out the dimensions + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + const int new_K = bottom[0]->count(axis); + CHECK_EQ(K_, new_K) + << "Input size incompatible with inner product parameters."; + // The first "axis" dimensions are independent inner products; the total + // number of these is M_, the product over these dimensions. + M_ = bottom[0]->count(0, axis); + // The top shape will be the bottom shape with the flattened axes dropped, + // and replaced by a single axis with dimension num_output (N_). + vector top_shape = bottom[0]->shape(); + top_shape.resize(axis + 1); + top_shape[axis] = N_; + top[0]->Reshape(top_shape); + // Set up the bias multiplier + if (bias_term_) { + vector bias_shape(1, M_); + bias_multiplier_.Reshape(bias_shape); + caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); + } } template void InnerProductLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., - bottom_data, weight, (Dtype) 0., top_data); - if (bias_term_) { - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., - bias_multiplier_.cpu_data(), - this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + caffe_cpu_gemm < Dtype + > 
(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, weight, (Dtype) 0., top_data); + if (bias_term_) { + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); + } } template void InnerProductLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - // Gradient with respect to weight - caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., - top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->cpu_diff(); - // Gradient with respect to bias - caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff, - bias_multiplier_.cpu_data(), (Dtype) 1., - this->blobs_[1]->mutable_cpu_diff()); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - // Gradient with respect to bottom data - caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., - bottom[0]->mutable_cpu_diff()); - } + const vector& propagate_down, const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Gradient with respect to weight + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->cpu_diff(); + // Gradient with respect to bias + caffe_cpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., top_diff, bias_multiplier_.cpu_data(), (Dtype) 1., this->blobs_[1]->mutable_cpu_diff()); + 
} + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + // Gradient with respect to bottom data + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., bottom[0]->mutable_cpu_diff()); + } } template void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., - bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); - if (bias_term_) { - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., - bias_multiplier_.gpu_data(), 0, - this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); + if (bias_term_) { + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.gpu_data(), 0, this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); + } } template void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm < Dtype - > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., - top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemv < Dtype 
- > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, - (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), - (size_t) 0, (Dtype) 0., 1, - this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., - top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., - bottom[0]->mutable_gpu_diff(), 0); - } + const vector& propagate_down, const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bias + caffe_gpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 0., 1, this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); + } + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bottom data + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 60b08d99..f6ace662 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -9,121 +9,121 @@ namespace caffe { template void LogLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - const Dtype base = this->layer_param_.log_param().base(); - if 
(base != Dtype(-1)) { - CHECK_GT(base, 0) << "base must be strictly positive."; - } - // If base == -1, interpret the base as e and set log_base = 1 exactly. - // Otherwise, calculate its log explicitly. - const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; - base_scale_ = Dtype(1) / log_base; - CHECK(!isnan(base_scale_)) - << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - CHECK(!isinf(base_scale_)) - << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - input_scale_ = this->layer_param_.log_param().scale(); - input_shift_ = this->layer_param_.log_param().shift(); - backward_num_scale_ = input_scale_ / log_base; + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + const Dtype base = this->layer_param_.log_param().base(); + if (base != Dtype(-1)) { + CHECK_GT(base, 0) << "base must be strictly positive."; + } + // If base == -1, interpret the base as e and set log_base = 1 exactly. + // Otherwise, calculate its log explicitly. + const Dtype log_base = (base == Dtype(-1)) ? 
Dtype(1) : log(base); + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; + base_scale_ = Dtype(1) / log_base; + CHECK(!isnan(base_scale_)) << "NaN result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; + CHECK(!isinf(base_scale_)) << "Inf result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; + input_scale_ = this->layer_param_.log_param().scale(); + input_shift_ = this->layer_param_.log_param().shift(); + backward_num_scale_ = input_scale_ / log_base; } template void LogLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_add_scalar(count, input_shift_, top_data); - } - caffe_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_scal(count, base_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_log(count, bottom_data, top_data); + } else { + caffe_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_add_scalar(count, input_shift_, top_data); + } + caffe_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_scal(count, base_scale_, top_data); + } } template void LogLayer::Backward_cpu(const vector*>& top, - const vector& 
propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_add_scalar(count, input_shift_, bottom_diff); - } - caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_scal(count, backward_num_scale_, bottom_diff); - } - caffe_mul(count, top_diff, bottom_diff, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + caffe_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_add_scalar(count, input_shift_, bottom_diff); + } + caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_scal(count, backward_num_scale_, bottom_diff); + } + caffe_mul(count, top_diff, bottom_diff, bottom_diff); } template void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_gpu_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, 
input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_gpu_log(count, bottom_data, top_data); + } else { + caffe_gpu_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); + } + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); + } } template void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { 
+ caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index f5da913a..64abbaa0 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -11,21 +11,21 @@ namespace caffe { template -void LossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - // LossLayers have a non-zero (1) loss by default. - if (this->layer_param_.loss_weight_size() == 0) { - this->layer_param_.add_loss_weight(Dtype(1)); - } +void LossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + // LossLayers have a non-zero (1) loss by default. + if (this->layer_param_.loss_weight_size() == 0) { + this->layer_param_.add_loss_weight(Dtype(1)); + } } template -void LossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; - vector loss_shape(0); // Loss layers output a scalar; 0 axes. - top[0]->Reshape(loss_shape); +void LossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(bottom[0]->num(), bottom[1]->num()) + << "The data and label should have the same number."; + vector loss_shape(0); // Loss layers output a scalar; 0 axes. 
+ top[0]->Reshape(loss_shape); } INSTANTIATE_CLASS (LossLayer); diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 2dfcd645..00e554bd 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -10,303 +10,304 @@ namespace caffe { template void LRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - size_ = this->layer_param_.lrn_param().local_size(); - CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; - pre_pad_ = (size_ - 1) / 2; - alpha_ = this->layer_param_.lrn_param().alpha(); - beta_ = this->layer_param_.lrn_param().beta(); - k_ = this->layer_param_.lrn_param().k(); - if (this->layer_param_.lrn_param().norm_region() == - LRNParameter_NormRegion_WITHIN_CHANNEL) { - // Set up split_layer_ to use inputs in the numerator and denominator. - split_top_vec_.clear(); - split_top_vec_.push_back(&product_input_); - split_top_vec_.push_back(&square_input_); - LayerParameter split_param; - split_layer_.reset(new SplitLayer(split_param)); - split_layer_->SetUp(bottom, split_top_vec_); - // Set up square_layer_ to square the inputs. - square_bottom_vec_.clear(); - square_top_vec_.clear(); - square_bottom_vec_.push_back(&square_input_); - square_top_vec_.push_back(&square_output_); - LayerParameter square_param; - square_param.mutable_power_param()->set_power(Dtype(2)); - square_layer_.reset(new PowerLayer(square_param)); - square_layer_->SetUp(square_bottom_vec_, square_top_vec_); - // Set up pool_layer_ to sum over square neighborhoods of the input. 
- pool_top_vec_.clear(); - pool_top_vec_.push_back(&pool_output_); - LayerParameter pool_param; - pool_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - pool_param.mutable_pooling_param()->set_pad(pre_pad_); - pool_param.mutable_pooling_param()->set_kernel_size(size_); - pool_layer_.reset(new PoolingLayer(pool_param)); - pool_layer_->SetUp(square_top_vec_, pool_top_vec_); - // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is - // the sum of a squared neighborhood (the output of pool_layer_). - power_top_vec_.clear(); - power_top_vec_.push_back(&power_output_); - LayerParameter power_param; - power_param.mutable_power_param()->set_power(-beta_); - power_param.mutable_power_param()->set_scale(alpha_); - power_param.mutable_power_param()->set_shift(Dtype(1)); - power_layer_.reset(new PowerLayer(power_param)); - power_layer_->SetUp(pool_top_vec_, power_top_vec_); - // Set up a product_layer_ to compute outputs by multiplying inputs by the - // inverse demoninator computed by the power layer. - product_bottom_vec_.clear(); - product_bottom_vec_.push_back(&product_input_); - product_bottom_vec_.push_back(&power_output_); - LayerParameter product_param; - EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - product_layer_.reset(new EltwiseLayer(product_param)); - product_layer_->SetUp(product_bottom_vec_, top); - } + const vector*>& top) { + size_ = this->layer_param_.lrn_param().local_size(); + CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; + pre_pad_ = (size_ - 1) / 2; + alpha_ = this->layer_param_.lrn_param().alpha(); + beta_ = this->layer_param_.lrn_param().beta(); + k_ = this->layer_param_.lrn_param().k(); + if (this->layer_param_.lrn_param().norm_region() + == LRNParameter_NormRegion_WITHIN_CHANNEL) { + // Set up split_layer_ to use inputs in the numerator and denominator. 
+ split_top_vec_.clear(); + split_top_vec_.push_back(&product_input_); + split_top_vec_.push_back(&square_input_); + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, split_top_vec_); + // Set up square_layer_ to square the inputs. + square_bottom_vec_.clear(); + square_top_vec_.clear(); + square_bottom_vec_.push_back(&square_input_); + square_top_vec_.push_back(&square_output_); + LayerParameter square_param; + square_param.mutable_power_param()->set_power(Dtype(2)); + square_layer_.reset(new PowerLayer(square_param)); + square_layer_->SetUp(square_bottom_vec_, square_top_vec_); + // Set up pool_layer_ to sum over square neighborhoods of the input. + pool_top_vec_.clear(); + pool_top_vec_.push_back(&pool_output_); + LayerParameter pool_param; + pool_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + pool_param.mutable_pooling_param()->set_pad(pre_pad_); + pool_param.mutable_pooling_param()->set_kernel_size(size_); + pool_layer_.reset(new PoolingLayer(pool_param)); + pool_layer_->SetUp(square_top_vec_, pool_top_vec_); + // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is + // the sum of a squared neighborhood (the output of pool_layer_). + power_top_vec_.clear(); + power_top_vec_.push_back(&power_output_); + LayerParameter power_param; + power_param.mutable_power_param()->set_power(-beta_); + power_param.mutable_power_param()->set_scale(alpha_); + power_param.mutable_power_param()->set_shift(Dtype(1)); + power_layer_.reset(new PowerLayer(power_param)); + power_layer_->SetUp(pool_top_vec_, power_top_vec_); + // Set up a product_layer_ to compute outputs by multiplying inputs by the + // inverse demoninator computed by the power layer. 
+ product_bottom_vec_.clear(); + product_bottom_vec_.push_back(&product_input_); + product_bottom_vec_.push_back(&power_output_); + LayerParameter product_param; + EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); + product_layer_.reset(new EltwiseLayer(product_param)); + product_layer_->SetUp(product_bottom_vec_, top); + } } template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - top[0]->Reshape(num_, channels_, height_, width_); - scale_.Reshape(num_, channels_, height_, width_); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - split_layer_->Reshape(bottom, split_top_vec_); - square_layer_->Reshape(square_bottom_vec_, square_top_vec_); - pool_layer_->Reshape(square_top_vec_, pool_top_vec_); - power_layer_->Reshape(pool_top_vec_, power_top_vec_); - product_layer_->Reshape(product_bottom_vec_, top); - break; - } + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + top[0]->Reshape(num_, channels_, height_, width_); + scale_.Reshape(num_, channels_, height_, width_); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + split_layer_->Reshape(bottom, split_top_vec_); + square_layer_->Reshape(square_bottom_vec_, square_top_vec_); + pool_layer_->Reshape(square_top_vec_, 
pool_top_vec_); + power_layer_->Reshape(pool_top_vec_, power_top_vec_); + product_layer_->Reshape(product_bottom_vec_, top); + break; + } } template void LRNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_cpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_cpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } template void LRNLayer::CrossChannelForward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); - // start with the constant value - for (int i = 0; i < scale_.count(); ++i) { - scale_data[i] = k_; - } - Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_); - Dtype* padded_square_data = padded_square.mutable_cpu_data(); - caffe_set(padded_square.count(), Dtype(0), padded_square_data); - Dtype alpha_over_size = alpha_ / size_; - // go through the images - for (int n = 0; n < num_; ++n) { - // compute the padded square - caffe_sqr(channels_ * height_ * width_, - bottom_data + bottom[0]->offset(n), - padded_square_data + padded_square.offset(0, pre_pad_)); - // Create the first channel scale - for (int c = 0; c < size_; ++c) { - caffe_axpy < Dtype > (height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c), - scale_data + scale_.offset(n, 0)); - } - for (int c = 1; c < channels_; ++c) { - // copy previous scale - 
caffe_copy < Dtype > (height_ * width_, - scale_data + scale_.offset(n, c - 1), - scale_data + scale_.offset(n, c)); - // add head - caffe_axpy < Dtype > (height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c + size_ - 1), - scale_data + scale_.offset(n, c)); - // subtract tail - caffe_axpy < Dtype > (height_ * width_, -alpha_over_size, - padded_square_data + padded_square.offset(0, c - 1), - scale_data + scale_.offset(n, c)); - } - } + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + // start with the constant value + for (int i = 0; i < scale_.count(); ++i) { + scale_data[i] = k_; + } + Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_); + Dtype* padded_square_data = padded_square.mutable_cpu_data(); + caffe_set(padded_square.count(), Dtype(0), padded_square_data); + Dtype alpha_over_size = alpha_ / size_; + // go through the images + for (int n = 0; n < num_; ++n) { + // compute the padded square + caffe_sqr(channels_ * height_ * width_, bottom_data + bottom[0]->offset(n), + padded_square_data + padded_square.offset(0, pre_pad_)); + // Create the first channel scale + for (int c = 0; c < size_; ++c) { + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c), scale_data + scale_.offset(n, 0)); + } + for (int c = 1; c < channels_; ++c) { + // copy previous scale + caffe_copy < Dtype + > (height_ * width_, scale_data + scale_.offset(n, c - 1), scale_data + + scale_.offset(n, c)); + // add head + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c + size_ - 1), scale_data + + scale_.offset(n, c)); + // subtract tail + caffe_axpy < Dtype + > (height_ * width_, -alpha_over_size, padded_square_data + + padded_square.offset(0, c - 1), scale_data + scale_.offset(n, 
c)); + } + } - // In the end, compute output - caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data); - caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); + // In the end, compute output + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data); + caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); } template -void LRNLayer::WithinChannelForward( - const vector*>& bottom, const vector*>& top) { - split_layer_->Forward(bottom, split_top_vec_); - square_layer_->Forward(square_bottom_vec_, square_top_vec_); - pool_layer_->Forward(square_top_vec_, pool_top_vec_); - power_layer_->Forward(pool_top_vec_, power_top_vec_); - product_layer_->Forward(product_bottom_vec_, top); +void LRNLayer::WithinChannelForward(const vector*>& bottom, + const vector*>& top) { + split_layer_->Forward(bottom, split_top_vec_); + square_layer_->Forward(square_bottom_vec_, square_top_vec_); + pool_layer_->Forward(square_top_vec_, pool_top_vec_); + power_layer_->Forward(pool_top_vec_, power_top_vec_); + product_layer_->Forward(product_bottom_vec_, top); } template void LRNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_cpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_cpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } template -void 
LRNLayer::CrossChannelBackward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* scale_data = scale_.cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_); - Blob < Dtype > accum_ratio(1, 1, height_, width_); - Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); - Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); - // We hack a little bit by using the diff() to store an additional result - Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); - caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); - Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_; +void LRNLayer::CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* scale_data = scale_.cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > accum_ratio(1, 1, height_, width_); + Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + // We hack a little bit by using the diff() to store an additional result + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); + Dtype cache_ratio_value = 2. 
* alpha_ * beta_ / size_; - caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff); - caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff); + caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff); - // go through individual data - int inverse_pre_pad = size_ - (size_ + 1) / 2; - for (int n = 0; n < num_; ++n) { - int block_offset = scale_.offset(n); - // first, compute diff_i * y_i / s_i - caffe_mul < Dtype > (channels_ * height_ * width_, - top_diff + block_offset, top_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - caffe_div < Dtype > (channels_ * height_ * width_, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), - scale_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - // Now, compute the accumulated ratios and the bottom diff - caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); - for (int c = 0; c < size_ - 1; ++c) { - caffe_axpy < Dtype > (height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); - } - for (int c = 0; c < channels_; ++c) { - caffe_axpy < Dtype > (height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), - accum_ratio_data); - // compute bottom diff - caffe_mul < Dtype > (height_ * width_, - bottom_data + top[0]->offset(n, c), - accum_ratio_data, accum_ratio_times_bottom); - caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value, - accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); - caffe_axpy < Dtype > (height_ * width_, -1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); - } - } + // go through individual data + int inverse_pre_pad = size_ - (size_ + 1) / 2; + for (int n = 0; n < num_; ++n) { + int block_offset = scale_.offset(n); + // first, compute diff_i * y_i / s_i + caffe_mul < Dtype + > (channels_ * height_ * width_, 
top_diff + block_offset, top_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); + caffe_div < Dtype + > (channels_ * height_ * width_, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad), scale_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); + // Now, compute the accumulated ratios and the bottom diff + caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); + for (int c = 0; c < size_ - 1; ++c) { + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); + } + for (int c = 0; c < channels_; ++c) { + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c + size_ - 1), accum_ratio_data); + // compute bottom diff + caffe_mul < Dtype + > (height_ * width_, bottom_data + top[0]->offset(n, c), accum_ratio_data, accum_ratio_times_bottom); + caffe_axpy < Dtype + > (height_ * width_, -cache_ratio_value, accum_ratio_times_bottom, bottom_diff + + top[0]->offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, -1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); + } + } } template -void LRNLayer::WithinChannelBackward( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - vector product_propagate_down(2, true); - product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); - power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); - pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); - square_layer_->Backward(square_top_vec_, propagate_down, - square_bottom_vec_); - split_layer_->Backward(split_top_vec_, propagate_down, bottom); - } +void LRNLayer::WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + vector product_propagate_down(2, true); + product_layer_->Backward(top, product_propagate_down, 
product_bottom_vec_); + power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); + pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); + square_layer_->Backward(square_top_vec_, propagate_down, + square_bottom_vec_); + split_layer_->Backward(split_top_vec_, propagate_down, bottom); + } } template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. 
+ int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); } template -void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); +void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); } template void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } template void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } } #ifdef CPU_ONLY STUB_GPU(LRNLayer); diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index e3b12908..eff0129c 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -10,109 +10,109 @@ namespace caffe { template void 
MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - batch_size_ = this->layer_param_.memory_data_param().batch_size(); - channels_ = this->layer_param_.memory_data_param().channels(); - height_ = this->layer_param_.memory_data_param().height(); - width_ = this->layer_param_.memory_data_param().width(); - size_ = channels_ * height_ * width_; - CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; - vector label_shape(1, batch_size_); - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(label_shape); - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(label_shape); - data_ = NULL; - labels_ = NULL; - added_data_.cpu_data(); - added_label_.cpu_data(); + const vector*>& top) { + batch_size_ = this->layer_param_.memory_data_param().batch_size(); + channels_ = this->layer_param_.memory_data_param().channels(); + height_ = this->layer_param_.memory_data_param().height(); + width_ = this->layer_param_.memory_data_param().width(); + size_ = channels_ * height_ * width_; + CHECK_GT(batch_size_ * size_, 0) + << "batch_size, channels, height, and width must be specified and" + " positive in memory_data_param"; + vector label_shape(1, batch_size_); + top[0]->Reshape(batch_size_, channels_, height_, width_); + top[1]->Reshape(label_shape); + added_data_.Reshape(batch_size_, channels_, height_, width_); + added_label_.Reshape(label_shape); + data_ = NULL; + labels_ = NULL; + added_data_.cpu_data(); + added_label_.cpu_data(); } template void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { - CHECK(!has_new_data_) << - "Can't add data until current data has been consumed."; - size_t num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to add."; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, 
height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) - this->data_transformer_->Transform(datum_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = datum_vector[item_id].label(); - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; + CHECK(!has_new_data_) + << "Can't add data until current data has been consumed."; + size_t num = datum_vector.size(); + CHECK_GT(num, 0) << "There is no datum to add."; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; + added_data_.Reshape(num, channels_, height_, width_); + added_label_.Reshape(num, 1, 1, 1); + // Apply data transformations (mirror, scale, crop...) + this->data_transformer_->Transform(datum_vector, &added_data_); + // Copy Labels + Dtype* top_label = added_label_.mutable_cpu_data(); + for (int item_id = 0; item_id < num; ++item_id) { + top_label[item_id] = datum_vector[item_id].label(); + } + // num_images == batch_size_ + Dtype* top_data = added_data_.mutable_cpu_data(); + Reset(top_data, top_label, num); + has_new_data_ = true; } template void MemoryDataLayer::AddMatVector(const vector& mat_vector, - const vector& labels) { - size_t num = mat_vector.size(); - CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; - CHECK_GT(num, 0) << "There is no mat to add"; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) 
- this->data_transformer_->Transform(mat_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = labels[item_id]; - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; + const vector& labels) { + size_t num = mat_vector.size(); + CHECK(!has_new_data_) + << "Can't add mat until current data has been consumed."; + CHECK_GT(num, 0) << "There is no mat to add"; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; + added_data_.Reshape(num, channels_, height_, width_); + added_label_.Reshape(num, 1, 1, 1); + // Apply data transformations (mirror, scale, crop...) + this->data_transformer_->Transform(mat_vector, &added_data_); + // Copy Labels + Dtype* top_label = added_label_.mutable_cpu_data(); + for (int item_id = 0; item_id < num; ++item_id) { + top_label[item_id] = labels[item_id]; + } + // num_images == batch_size_ + Dtype* top_data = added_data_.mutable_cpu_data(); + Reset(top_data, top_label, num); + has_new_data_ = true; } template void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { - CHECK(data); - CHECK(labels); - CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size"; - // Warn with transformation parameters since a memory array is meant to - // be generic and no transformations are done with Reset(). - if (this->layer_param_.has_transform_param()) { - LOG(WARNING) << this->type() << " does not transform array data on Reset()"; - } - data_ = data; - labels_ = labels; - n_ = n; - pos_ = 0; + CHECK(data); + CHECK(labels); + CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size"; + // Warn with transformation parameters since a memory array is meant to + // be generic and no transformations are done with Reset(). 
+ if (this->layer_param_.has_transform_param()) { + LOG(WARNING) << this->type() << " does not transform array data on Reset()"; + } + data_ = data; + labels_ = labels; + n_ = n; + pos_ = 0; } template void MemoryDataLayer::set_batch_size(int new_size) { - CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; - batch_size_ = new_size; - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(batch_size_, 1, 1, 1); + CHECK(!has_new_data_) + << "Can't change batch_size until current data has been consumed."; + batch_size_ = new_size; + added_data_.Reshape(batch_size_, channels_, height_, width_); + added_label_.Reshape(batch_size_, 1, 1, 1); } template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(batch_size_, 1, 1, 1); - top[0]->set_cpu_data(data_ + pos_ * size_); - top[1]->set_cpu_data(labels_ + pos_); - pos_ = (pos_ + batch_size_) % n_; - if (pos_ == 0) - has_new_data_ = false; + const vector*>& top) { + CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; + top[0]->Reshape(batch_size_, channels_, height_, width_); + top[1]->Reshape(batch_size_, 1, 1, 1); + top[0]->set_cpu_data(data_ + pos_ * size_); + top[1]->set_cpu_data(labels_ + pos_); + pos_ = (pos_ + batch_size_) % n_; + if (pos_ == 0) + has_new_data_ = false; } INSTANTIATE_CLASS (MemoryDataLayer); diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 358ed891..4d8b69bc 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -12,53 +12,52 @@ namespace caffe { template void MultinomialLogisticLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > 
::Reshape(bottom, top); - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); } template void MultinomialLogisticLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); - loss -= log(prob); - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + Dtype prob = std::max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + loss -= log(prob); + } + top[0]->mutable_cpu_data()[0] = loss / num; } template void MultinomialLogisticLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - const Dtype scale = -top[0]->cpu_diff()[0] / num; - for (int 
i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + label] = scale / prob; - } - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); + const Dtype scale = -top[0]->cpu_diff()[0] / num; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + Dtype prob = std::max(bottom_data[i * dim + label], + Dtype(kLOG_THRESHOLD)); + bottom_diff[i * dim + label] = scale / prob; + } + } } INSTANTIATE_CLASS (MultinomialLogisticLossLayer); diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 0a6613d7..64c3063f 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -9,245 +9,223 @@ namespace caffe { template void MVNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, 1, - bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); - eps_ = this->layer_param_.mvn_param().eps(); + const vector*>& top) { + top[0]->Reshape(bottom[0]->num(), 
bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width()); + Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); + eps_ = this->layer_param_.mvn_param().eps(); } template void MVNLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - caffe_cpu_gemv < Dtype - > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), - sum_multiplier_.cpu_data(), 0., - variance_.mutable_cpu_data()); // E(X^2) - caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), - temp_.mutable_cpu_data()); // (EX)^2 - caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), - variance_.mutable_cpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); - - // normalize variance - caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), - variance_.mutable_cpu_data()); - - caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); - } else { - caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - - // subtract mean - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_cpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), sum_multiplier_.cpu_data(), 0., variance_.mutable_cpu_data()); // E(X^2) + caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), + temp_.mutable_cpu_data()); // (EX)^2 + caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), + variance_.mutable_cpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); + + caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); + + // normalize variance + caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), + variance_.mutable_cpu_data()); + + caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); + + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); + + caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); + } else { + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + + // subtract mean + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); + + caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); + } } template void MVNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - bottom_diff); - caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., - bottom_diff); - - caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); - - caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., bottom_diff); + caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., bottom_diff); + + caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_powx(temp_.count(), bottom_data, Dtype(2), temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); + + caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); + } else { + caffe_copy(temp_.count(), top_diff, bottom_diff); + } } template void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv < Dtype - > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), sum_multiplier_.gpu_data(), 0., variance_.mutable_gpu_data()); // E(X^2) + caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), + temp_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), + variance_.mutable_gpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + } } template void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. 
/ dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), 
top_diff, Dtype(-1. / dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp index a9edeffd..4fa61aad 100644 --- a/src/caffe/layers/neuron_layer.cpp +++ b/src/caffe/layers/neuron_layer.cpp @@ -7,8 +7,8 @@ namespace caffe { template void NeuronLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->ReshapeLike(*bottom[0]); + const vector*>& top) { + top[0]->ReshapeLike(*bottom[0]); } INSTANTIATE_CLASS (NeuronLayer); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 47830228..85c57379 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -15,397 +15,397 @@ using std::max; template void PoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingParameter pool_param = this->layer_param_.pooling_param(); - if (pool_param.global_pooling()) { - CHECK(!(pool_param.has_kernel_size() || - pool_param.has_kernel_h() || pool_param.has_kernel_w())) - << "With Global_pooling: true Filter size cannot specified"; - } else { - CHECK(!pool_param.has_kernel_size() != - !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and 
kernel_w; not both"; - CHECK(pool_param.has_kernel_size() || - (pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; - } - CHECK((!pool_param.has_pad() && pool_param.has_pad_h() - && pool_param.has_pad_w()) - || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) - << "pad is pad OR pad_h and pad_w are required."; - CHECK((!pool_param.has_stride() && pool_param.has_stride_h() - && pool_param.has_stride_w()) - || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) - << "Stride is stride OR stride_h and stride_w are required."; - global_pooling_ = pool_param.global_pooling(); - if (global_pooling_) { - kernel_h_ = bottom[0]->height(); - kernel_w_ = bottom[0]->width(); - } else { - if (pool_param.has_kernel_size()) { - kernel_h_ = kernel_w_ = pool_param.kernel_size(); - } else { - kernel_h_ = pool_param.kernel_h(); - kernel_w_ = pool_param.kernel_w(); - } - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - if (!pool_param.has_pad_h()) { - pad_h_ = pad_w_ = pool_param.pad(); - } else { - pad_h_ = pool_param.pad_h(); - pad_w_ = pool_param.pad_w(); - } - if (!pool_param.has_stride_h()) { - stride_h_ = stride_w_ = pool_param.stride(); - } else { - stride_h_ = pool_param.stride_h(); - stride_w_ = pool_param.stride_w(); - } - if (global_pooling_) { - CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_AVE - || this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_MAX) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_h_, kernel_h_); - CHECK_LT(pad_w_, kernel_w_); - } + const vector*>& top) { + PoolingParameter pool_param = this->layer_param_.pooling_param(); + 
if (pool_param.global_pooling()) { + CHECK( + !(pool_param.has_kernel_size() || pool_param.has_kernel_h() + || pool_param.has_kernel_w())) + << "With Global_pooling: true Filter size cannot specified"; + } else { + CHECK( + !pool_param.has_kernel_size() + != !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK( + pool_param.has_kernel_size() + || (pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + } + CHECK( + (!pool_param.has_pad() && pool_param.has_pad_h() && pool_param.has_pad_w()) + || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK( + (!pool_param.has_stride() && pool_param.has_stride_h() + && pool_param.has_stride_w()) + || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + global_pooling_ = pool_param.global_pooling(); + if (global_pooling_) { + kernel_h_ = bottom[0]->height(); + kernel_w_ = bottom[0]->width(); + } else { + if (pool_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = pool_param.kernel_size(); + } else { + kernel_h_ = pool_param.kernel_h(); + kernel_w_ = pool_param.kernel_w(); + } + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!pool_param.has_pad_h()) { + pad_h_ = pad_w_ = pool_param.pad(); + } else { + pad_h_ = pool_param.pad_h(); + pad_w_ = pool_param.pad_w(); + } + if (!pool_param.has_stride_h()) { + stride_h_ = stride_w_ = pool_param.stride(); + } else { + stride_h_ = pool_param.stride_h(); + stride_w_ = pool_param.stride_w(); + } + if (global_pooling_) { + CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) + << "With Global_pooling: true; only pad = 0 and stride = 1"; + } + if (pad_h_ != 0 || pad_w_ != 0) { + CHECK( + 
this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_AVE + || this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) + << "Padding implemented only for average and max pooling."; + CHECK_LT(pad_h_, kernel_h_); + CHECK_LT(pad_w_, kernel_w_); + } } template void PoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); - if (global_pooling_) { - kernel_h_ = bottom[0]->height(); - kernel_w_ = bottom[0]->width(); - } - pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; - pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; - if (pad_h_ || pad_w_) { - // If we have padding, ensure that the last pooling starts strictly - // inside the image (instead of at the padding); otherwise clip the last. - if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) { - --pooled_height_; - } - if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) { - --pooled_width_; - } - CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); - CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); - } - top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - if (top.size() > 1) { - top[1]->ReshapeLike(*top[0]); - } - // If max pooling, we will initialize the vector index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX && top.size() == 1) { - max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - } - // If stochastic pooling, we will initialize the random index part. 
- if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_STOCHASTIC) { - rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); - } + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + if (global_pooling_) { + kernel_h_ = bottom[0]->height(); + kernel_w_ = bottom[0]->width(); + } + pooled_height_ = static_cast(ceil( + static_cast(height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(ceil( + static_cast(width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + if (pad_h_ || pad_w_) { + // If we have padding, ensure that the last pooling starts strictly + // inside the image (instead of at the padding); otherwise clip the last. + if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) { + --pooled_height_; + } + if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) { + --pooled_width_; + } + CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); + CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); + } + top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); + if (top.size() > 1) { + top[1]->ReshapeLike(*top[0]); + } + // If max pooling, we will initialize the vector index part. + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX && top.size() == 1) { + max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, + pooled_width_); + } + // If stochastic pooling, we will initialize the random index part. + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_STOCHASTIC) { + rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, + pooled_width_); + } } // TODO(Yangqing): Is there a faster way to do pooling in the channel-first // case? 
template void PoolingLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int top_count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; // suppress warnings about uninitalized variables - Dtype* top_mask = NULL; - // Different pooling methods. We explicitly do the switch outside the for - // loop to save time, although this results in more code. - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - // Initialize - if (use_top_mask) { - top_mask = top[1]->mutable_cpu_data(); - caffe_set(top_count, Dtype(-1), top_mask); - } else { - mask = max_idx_.mutable_cpu_data(); - caffe_set(top_count, -1, mask); - } - caffe_set(top_count, Dtype(-FLT_MAX), top_data); - // The main loop - for (int n = 0; n < bottom[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_); - int wend = min(wstart + kernel_w_, width_); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - const int pool_index = ph * pooled_width_ + pw; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width_ + w; - if (bottom_data[index] > top_data[pool_index]) { - top_data[pool_index] = bottom_data[index]; - if (use_top_mask) { - top_mask[pool_index] = static_cast(index); - } else { - mask[pool_index] = index; - } - } - } - } - } - } - // compute offset - bottom_data += bottom[0]->offset(0, 1); - top_data += top[0]->offset(0, 1); - if (use_top_mask) { - top_mask += top[0]->offset(0, 1); - } else { - mask += top[0]->offset(0, 1); - } - } - } - break; - case PoolingParameter_PoolMethod_AVE: 
- for (int i = 0; i < top_count; ++i) { - top_data[i] = 0; - } - // The main loop - for (int n = 0; n < bottom[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_ + pad_h_); - int wend = min(wstart + kernel_w_, width_ + pad_w_); - int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height_); - wend = min(wend, width_); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - top_data[ph * pooled_width_ + pw] += - bottom_data[h * width_ + w]; - } - } - top_data[ph * pooled_width_ + pw] /= pool_size; - } - } - // compute offset - bottom_data += bottom[0]->offset(0, 1); - top_data += top[0]->offset(0, 1); - } - } - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - NOT_IMPLEMENTED; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int top_count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; // suppress warnings about uninitalized variables + Dtype* top_mask = NULL; + // Different pooling methods. We explicitly do the switch outside the for + // loop to save time, although this results in more code. 
+ switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + // Initialize + if (use_top_mask) { + top_mask = top[1]->mutable_cpu_data(); + caffe_set(top_count, Dtype(-1), top_mask); + } else { + mask = max_idx_.mutable_cpu_data(); + caffe_set(top_count, -1, mask); + } + caffe_set(top_count, Dtype(-FLT_MAX), top_data); + // The main loop + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_); + int wend = min(wstart + kernel_w_, width_); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const int pool_index = ph * pooled_width_ + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width_ + w; + if (bottom_data[index] > top_data[pool_index]) { + top_data[pool_index] = bottom_data[index]; + if (use_top_mask) { + top_mask[pool_index] = static_cast(index); + } else { + mask[pool_index] = index; + } + } + } + } + } + } + // compute offset + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + if (use_top_mask) { + top_mask += top[0]->offset(0, 1); + } else { + mask += top[0]->offset(0, 1); + } + } + } + break; + case PoolingParameter_PoolMethod_AVE: + for (int i = 0; i < top_count; ++i) { + top_data[i] = 0; + } + // The main loop + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_ + pad_h_); + int wend = min(wstart + kernel_w_, width_ + pad_w_); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); 
+ hend = min(hend, height_); + wend = min(wend, width_); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + top_data[ph * pooled_width_ + pw] += + bottom_data[h * width_ + w]; + } + } + top_data[ph * pooled_width_ + pw] /= pool_size; + } + } + // compute offset + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } template void PoolingLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - // Different pooling methods. We explicitly do the switch outside the for - // loop to save time, although this results in more codes. - caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - const int* mask = NULL; // suppress warnings about uninitialized variables - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - // The main loop - if (use_top_mask) { - top_mask = top[1]->cpu_data(); - } else { - mask = max_idx_.cpu_data(); - } - for (int n = 0; n < top[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - const int index = ph * pooled_width_ + pw; - const int bottom_index = - use_top_mask ? 
top_mask[index] : mask[index]; - bottom_diff[bottom_index] += top_diff[index]; - } - } - bottom_diff += bottom[0]->offset(0, 1); - top_diff += top[0]->offset(0, 1); - if (use_top_mask) { - top_mask += top[0]->offset(0, 1); - } else { - mask += top[0]->offset(0, 1); - } - } - } - break; - case PoolingParameter_PoolMethod_AVE: - // The main loop - for (int n = 0; n < top[0]->num(); ++n) { - for (int c = 0; c < channels_; ++c) { - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * stride_h_ - pad_h_; - int wstart = pw * stride_w_ - pad_w_; - int hend = min(hstart + kernel_h_, height_ + pad_h_); - int wend = min(wstart + kernel_w_, width_ + pad_w_); - int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height_); - wend = min(wend, width_); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - bottom_diff[h * width_ + w] += - top_diff[ph * pooled_width_ + pw] / pool_size; - } - } - } - } - // offset - bottom_diff += bottom[0]->offset(0, 1); - top_diff += top[0]->offset(0, 1); - } - } - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - NOT_IMPLEMENTED; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + // Different pooling methods. We explicitly do the switch outside the for + // loop to save time, although this results in more codes. + caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + const int* mask = NULL; // suppress warnings about uninitialized variables + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + // The main loop + if (use_top_mask) { + top_mask = top[1]->cpu_data(); + } else { + mask = max_idx_.cpu_data(); + } + for (int n = 0; n < top[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + const int index = ph * pooled_width_ + pw; + const int bottom_index = + use_top_mask ? top_mask[index] : mask[index]; + bottom_diff[bottom_index] += top_diff[index]; + } + } + bottom_diff += bottom[0]->offset(0, 1); + top_diff += top[0]->offset(0, 1); + if (use_top_mask) { + top_mask += top[0]->offset(0, 1); + } else { + mask += top[0]->offset(0, 1); + } + } + } + break; + case PoolingParameter_PoolMethod_AVE: + // The main loop + for (int n = 0; n < top[0]->num(); ++n) { + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_height_; ++ph) { + for (int pw = 0; pw < pooled_width_; ++pw) { + int hstart = ph * stride_h_ - pad_h_; + int wstart = pw * stride_w_ - pad_w_; + int hend = min(hstart + kernel_h_, height_ + pad_h_); + int wend = min(wstart + kernel_w_, width_ + pad_w_); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height_); + wend = min(wend, width_); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + bottom_diff[h * width_ + w] += top_diff[ph * pooled_width_ + pw] + / pool_size; + } + } + } + } + // offset + bottom_diff += bottom[0]->offset(0, 1); + top_diff += top[0]->offset(0, 1); + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } // begin: code written/modified by AMD template void 
PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - //Forward_cpu(bottom, top); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector*>& top) { + //Forward_cpu(bottom, top); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; + Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->mutable_gpu_data(); + } else { + mask = max_idx_.mutable_gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data, mask, top_mask); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + if (this->phase_ == TRAIN) { + // We need to create the random index as well. 
+ caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, rand_idx_.mutable_gpu_data(), top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, top_data); + } + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } template void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - //Backward_cpu(top, propagate_down, bottom); - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. 
- const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward(count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } + const vector& propagate_down, const vector*>& bottom) { + //Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. 
+ const bool use_top_mask = top.size() > 1; + const int* mask = NULL; + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->gpu_data(); + } else { + mask = max_idx_.gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolBackward(count, top_diff, top[0]->num(), channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, top[0]->num(), + channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } } // end: code written/modified by AMD diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 0cf82c35..6b2c5f1d 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -11,163 +11,162 @@ namespace caffe { template void PowerLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - power_ = this->layer_param_.power_param().power(); - scale_ = this->layer_param_.power_param().scale(); - shift_ = this->layer_param_.power_param().shift(); - diff_scale_ = power_ * scale_; + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + power_ = this->layer_param_.power_param().power(); + scale_ = this->layer_param_.power_param().scale(); + shift_ = 
this->layer_param_.power_param().shift(); + diff_scale_ = power_ * scale_; } // Compute y = (shift + scale * x)^power template void PowerLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->cpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_powx(count, top_data, power_, top_data); - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); + caffe_set(count, value, top_data); + return; + } + const Dtype* bottom_data = bottom[0]->cpu_data(); + caffe_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_powx(count, top_data, power_, top_data); + } } template void PowerLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->cpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->cpu_data(); - caffe_div < Dtype > (count, top_data, bottom_diff, 
bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_scal(count, diff_scale_, bottom_diff); - } - } - } - if (diff_scale_ != Dtype(0)) { - caffe_mul(count, top_diff, bottom_diff, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->cpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + caffe_set(count, diff_scale_, bottom_diff); + } else { + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->cpu_data(); + caffe_div(count, top_data, bottom_data, bottom_diff); + caffe_scal(count, power_, bottom_diff); + } else { + caffe_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->cpu_data(); + caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_scal(count, diff_scale_, bottom_diff); + } + } + } + if (diff_scale_ != Dtype(0)) { + caffe_mul(count, top_diff, bottom_diff, bottom_diff); + } + } } // begin: code written/modified by AMD template void 
PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - ocl_memset(top_data, value, count); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_gpu_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } + const vector*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); + ocl_memset(top_data, value, count); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_gpu_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(count, top_data, power_, top_data); + } } template void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - ocl_memset(bottom_diff, diff_scale_, count); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special 
case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + ocl_memset(bottom_diff, diff_scale_, count); + } else { + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, 
diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); + } else { + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); + } + } + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + } } // end: code written/modified by AMD #ifdef CPU_ONLY diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index cbf7f064..8ec6664d 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -9,197 +9,192 @@ namespace caffe { template void PReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; - PReLUParameter prelu_param = this->layer_param().prelu_param(); - int channels = bottom[0]->channels(); - channel_shared_ = prelu_param.channel_shared(); - if (this->blobs_.size() > 0) { - LOG(INFO) << "Skipping parameter initialization"; - } else { - this->blobs_.resize(1); - if (channel_shared_) { - this->blobs_[0].reset(new Blob(vector(0))); - } else { - this->blobs_[0].reset(new Blob(vector(1, channels))); - } - shared_ptr < Filler > filler; - if (prelu_param.has_filler()) { - filler.reset(GetFiller < Dtype > (prelu_param.filler())); - } else { - FillerParameter filler_param; - filler_param.set_type("constant"); - 
filler_param.set_value(0.25); - filler.reset(GetFiller < Dtype > (filler_param)); - } - filler->Fill(this->blobs_[0].get()); - } - if (channel_shared_) { - CHECK_EQ(this->blobs_[0]->count(), 1) - << "Negative slope size is inconsistent with prototxt config"; - } else { - CHECK_EQ(this->blobs_[0]->count(), channels) - << "Negative slope size is inconsistent with prototxt config"; - } - - // Propagate gradients to the parameters (as directed by backward pass). - this->param_propagate_down_.resize(this->blobs_.size(), true); - multiplier_.Reshape(vector(1, bottom[0]->count(1))); - backward_buff_.Reshape(vector(1, bottom[0]->count(1))); - caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + PReLUParameter prelu_param = this->layer_param().prelu_param(); + int channels = bottom[0]->channels(); + channel_shared_ = prelu_param.channel_shared(); + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + this->blobs_.resize(1); + if (channel_shared_) { + this->blobs_[0].reset(new Blob(vector(0))); + } else { + this->blobs_[0].reset(new Blob(vector(1, channels))); + } + shared_ptr < Filler > filler; + if (prelu_param.has_filler()) { + filler.reset(GetFiller < Dtype > (prelu_param.filler())); + } else { + FillerParameter filler_param; + filler_param.set_type("constant"); + filler_param.set_value(0.25); + filler.reset(GetFiller < Dtype > (filler_param)); + } + filler->Fill(this->blobs_[0].get()); + } + if (channel_shared_) { + CHECK_EQ(this->blobs_[0]->count(), 1) + << "Negative slope size is inconsistent with prototxt config"; + } else { + CHECK_EQ(this->blobs_[0]->count(), channels) + << "Negative slope size is inconsistent with prototxt config"; + } + + // Propagate gradients to the parameters (as directed by backward pass). 
+ this->param_propagate_down_.resize(this->blobs_.size(), true); + multiplier_.Reshape(vector(1, bottom[0]->count(1))); + backward_buff_.Reshape(vector(1, bottom[0]->count(1))); + caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); } template void PReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom[0]->num_axes(), 2) - << "Number of axes of bottom blob must be >=2."; - top[0]->ReshapeLike(*bottom[0]); - if (bottom[0] == top[0]) { - // For in-place computation - bottom_memory_.ReshapeLike(*bottom[0]); - } + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + top[0]->ReshapeLike(*bottom[0]); + if (bottom[0] == top[0]) { + // For in-place computation + bottom_memory_.ReshapeLike(*bottom[0]); + } } template void PReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->cpu_data(); - - // For in-place computation - if (bottom[0] == top[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data()); - } - - // if channel_shared, channel index in the following computation becomes - // always zero. - const int div_factor = channel_shared_ ? 
channels : 1; - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - top_data[i] = std::max(bottom_data[i], Dtype(0)) - + slope_data[c] * std::min(bottom_data[i], Dtype(0)); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->cpu_data(); + + // For in-place computation + if (bottom[0] == top[0]) { + caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data()); + } + + // if channel_shared, channel index in the following computation becomes + // always zero. + const int div_factor = channel_shared_ ? channels : 1; + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + top_data[i] = std::max(bottom_data[i], Dtype(0)) + + slope_data[c] * std::min(bottom_data[i], Dtype(0)); + } } template void PReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* slope_data = this->blobs_[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.cpu_data(); - } - - // if channel_shared, channel index in the following computation becomes - // always zero. - const int div_factor = channel_shared_ ? channels : 1; - - // Propagte to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. 
- if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + slope_data[c] * (bottom_data[i] <= 0)); - } - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* slope_data = this->blobs_[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + // For in-place computation + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.cpu_data(); + } + + // if channel_shared, channel index in the following computation becomes + // always zero. + const int div_factor = channel_shared_ ? channels : 1; + + // Propagte to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. 
+ if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff(); + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int i = 0; i < count; ++i) { + int c = (i / dim) % channels / div_factor; + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + slope_data[c] * (bottom_data[i] <= 0)); + } + } } template void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? channels : 1; - - if (top[0] == bottom[0]) { - caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, - div_factor); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + const int div_factor = channel_shared_ ? 
channels : 1; + + if (top[0] == bottom[0]) { + caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, + div_factor); } template void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. - if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward( - cdim, top_diff, top[0]->offset(n), - bottom_data, bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - if (channel_shared_) { - Dtype d; - caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = channel_shared_ ? 
channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, - slope_data, - div_factor); - } + const vector& propagate_down, const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.gpu_data(); + } + + // Propagate to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. + if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); + int cdim = channels * dim; + Dtype dsum = 0.; + for (int n = 0; n < bottom[0]->num(); ++n) { + // compute element-wise diff + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUParamBackward(cdim, top_diff, top[0]->offset(n), bottom_data, + bottom[0]->offset(n), backward_buff_.mutable_gpu_diff()); + if (channel_shared_) { + Dtype d; + caffe_gpu_dot < Dtype + > (channels * dim, backward_buff_.gpu_diff(), multiplier_.gpu_data(), &d); + dsum += d; + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, channels, dim, 1., backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., slope_diff); + } + } + if (channel_shared_) { + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + int div_factor = channel_shared_ ? 
channels : 1; + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, + slope_data, div_factor); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index ddf70e46..89df6589 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -10,201 +10,201 @@ namespace caffe { template void ReductionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - op_ = this->layer_param_.reduction_param().operation(); + const vector*>& top) { + op_ = this->layer_param_.reduction_param().operation(); } template void ReductionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - axis_ = bottom[0]->CanonicalAxisIndex( - this->layer_param_.reduction_param().axis()); - // In the output, we'll keep all axes up to the reduction axis, but - // throw away any after that. - // Note: currently reducing along non-tail axes is not supported; otherwise, - // we'd need to also copy any axes following an "end_axis". - vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + axis_); - top[0]->Reshape(top_shape); - num_ = bottom[0]->count(0, axis_); - dim_ = bottom[0]->count(axis_); - CHECK_EQ(num_, top[0]->count()); - if (op_ == ReductionParameter_ReductionOp_SUM || - op_ == ReductionParameter_ReductionOp_MEAN) { - vector sum_mult_shape(1, dim_); - sum_multiplier_.Reshape(sum_mult_shape); - caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); - } - coeff_ = this->layer_param().reduction_param().coeff(); - if (op_ == ReductionParameter_ReductionOp_MEAN) { - coeff_ /= dim_; - } + const vector*>& top) { + axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.reduction_param().axis()); + // In the output, we'll keep all axes up to the reduction axis, but + // throw away any after that. 
+ // Note: currently reducing along non-tail axes is not supported; otherwise, + // we'd need to also copy any axes following an "end_axis". + vector top_shape(bottom[0]->shape().begin(), + bottom[0]->shape().begin() + axis_); + top[0]->Reshape(top_shape); + num_ = bottom[0]->count(0, axis_); + dim_ = bottom[0]->count(axis_); + CHECK_EQ(num_, top[0]->count()); + if (op_ == ReductionParameter_ReductionOp_SUM + || op_ == ReductionParameter_ReductionOp_MEAN) { + vector sum_mult_shape(1, dim_); + sum_multiplier_.Reshape(sum_mult_shape); + caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); + } + coeff_ = this->layer_param().reduction_param().coeff(); + if (op_ == ReductionParameter_ReductionOp_MEAN) { + coeff_ /= dim_; + } } template -void ReductionLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.cpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data); - break; - case ReductionParameter_ReductionOp_ASUM: - *top_data = caffe_cpu_asum(dim_, bottom_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. 
- top_data = top[0]->mutable_cpu_data(); - caffe_scal(num_, coeff_, top_data); - } +void ReductionLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.cpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data); + break; + case ReductionParameter_ReductionOp_ASUM: + *top_data = caffe_cpu_asum(dim_, bottom_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_cpu_data(); + caffe_scal(num_, coeff_, top_data); + } } template void ReductionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->cpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_cpu_sign(dim_, bottom_data, bottom_diff); - caffe_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->cpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_set(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_cpu_sign(dim_, bottom_data, bottom_diff); + caffe_scal(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + bottom_diff += dim_; + ++top_diff; + } } template void ReductionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - 
LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data, top_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); + } } template void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - // Get bottom_data, if needed. 
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->gpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data, bottom_diff); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data += dim_; + bottom_diff += dim_; + ++top_diff; + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 334dc244..b07e6447 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -7,56 +7,54 @@ namespace caffe { template void ReLULayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - for (int i = 0; i < count; ++i) { - top_data[i] = std::max(bottom_data[i], Dtype(0)) - + negative_slope * std::min(bottom_data[i], Dtype(0)); - } + const vector*>& top) { + const Dtype* 
bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + for (int i = 0; i < count; ++i) { + top_data[i] = std::max(bottom_data[i], Dtype(0)) + + negative_slope * std::min(bottom_data[i], Dtype(0)); + } } template void ReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + negative_slope * (bottom_data[i] <= 0)); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + negative_slope * (bottom_data[i] <= 0)); + } + } } template void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - ReLUForward(count, bottom_data, top_data, negative_slope); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = 
this->layer_param_.relu_param().negative_slope(); + ReLUForward(count, bottom_data, top_data, negative_slope); } template void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index 094e61ef..a2377d87 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -7,87 +7,87 @@ namespace caffe { template void ReshapeLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - inferred_axis_ = -1; - copy_axes_.clear(); - const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); - const int top_num_axes = top_blob_shape.dim_size(); - constant_count_ = 1; - for (int i = 0; i < top_num_axes; ++i) { - const int top_dim = top_blob_shape.dim(i); - if (top_dim == 0) { - copy_axes_.push_back(i); - } else if (top_dim == -1) { - CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " - << "-1 dims; at most a single (1) value of -1 may be specified"; - inferred_axis_ = i; - } else { - constant_count_ *= top_dim; - } - } + const vector*>& top) { + inferred_axis_ = -1; + 
copy_axes_.clear(); + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int top_num_axes = top_blob_shape.dim_size(); + constant_count_ = 1; + for (int i = 0; i < top_num_axes; ++i) { + const int top_dim = top_blob_shape.dim(i); + if (top_dim == 0) { + copy_axes_.push_back(i); + } else if (top_dim == -1) { + CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " + << "-1 dims; at most a single (1) value of -1 may be specified"; + inferred_axis_ = i; + } else { + constant_count_ *= top_dim; + } + } } template void ReshapeLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int input_start_axis = this->layer_param_.reshape_param().axis(); - const int start_axis = - (input_start_axis >= 0) ? input_start_axis : - bottom[0]->num_axes() + input_start_axis + 1; - CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; - CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis - << " out of range for " << bottom[0]->num_axes() << "-D input blob"; - const int num_axes = this->layer_param_.reshape_param().num_axes(); - CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; - const int end_axis = - (num_axes == -1) ? 
bottom[0]->num_axes() : (start_axis + num_axes); - CHECK_LE(end_axis, bottom[0]->num_axes()) - << "end_axis = axis + num_axes is out of range"; - const int num_axes_replaced = end_axis - start_axis; - const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; - const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); - const int num_new_axes = top_blob_shape.dim_size(); - vector top_shape(num_axes_retained + num_new_axes); - int top_shape_index = 0; - for (int i = 0; i < start_axis; ++i) { - top_shape[top_shape_index++] = bottom[0]->shape(i); - } - for (int i = 0; i < num_new_axes; ++i) { - top_shape[top_shape_index++] = top_blob_shape.dim(i); - } - for (int i = end_axis; i < bottom[0]->num_axes(); ++i) { - top_shape[top_shape_index++] = bottom[0]->shape(i); - } - CHECK_EQ(top_shape_index, top_shape.size()); - for (int i = 0; i < copy_axes_.size(); ++i) { - const int copy_axis_index = copy_axes_[i]; - CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) - << "new shape contains a 0, but there was no corresponding bottom axis " - << "to copy"; - top_shape[start_axis + copy_axis_index] = - bottom[0]->shape(start_axis + copy_axis_index); - } - if (inferred_axis_ >= 0) { - // A -1 dim was specified; infer the correct dimension by computing the - // product of the other dimensions. 
- int explicit_count = constant_count_; - explicit_count *= bottom[0]->count(0, start_axis); - explicit_count *= bottom[0]->count(end_axis); - for (int i = 0; i < copy_axes_.size(); ++i) { - const int copy_axis_index = copy_axes_[i]; - explicit_count *= top_shape[start_axis + copy_axis_index]; - } - CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" - << bottom[0]->count() << ") must be divisible by the product of " - << "the specified dimensions (" << explicit_count << ")"; - const int inferred_dim = bottom[0]->count() / explicit_count; - top_shape[start_axis + inferred_axis_] = inferred_dim; - } - top[0]->Reshape(top_shape); - CHECK_EQ(top[0]->count(), bottom[0]->count()) - << "output count must match input count"; - top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); + const vector*>& top) { + const int input_start_axis = this->layer_param_.reshape_param().axis(); + const int start_axis = + (input_start_axis >= 0) ? + input_start_axis : bottom[0]->num_axes() + input_start_axis + 1; + CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; + CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis + << " out of range for " << bottom[0]->num_axes() << "-D input blob"; + const int num_axes = this->layer_param_.reshape_param().num_axes(); + CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; + const int end_axis = + (num_axes == -1) ? 
bottom[0]->num_axes() : (start_axis + num_axes); + CHECK_LE(end_axis, bottom[0]->num_axes()) + << "end_axis = axis + num_axes is out of range"; + const int num_axes_replaced = end_axis - start_axis; + const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int num_new_axes = top_blob_shape.dim_size(); + vector top_shape(num_axes_retained + num_new_axes); + int top_shape_index = 0; + for (int i = 0; i < start_axis; ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + for (int i = 0; i < num_new_axes; ++i) { + top_shape[top_shape_index++] = top_blob_shape.dim(i); + } + for (int i = end_axis; i < bottom[0]->num_axes(); ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + CHECK_EQ(top_shape_index, top_shape.size()); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) + << "new shape contains a 0, but there was no corresponding bottom axis " + << "to copy"; + top_shape[start_axis + copy_axis_index] = bottom[0]->shape( + start_axis + copy_axis_index); + } + if (inferred_axis_ >= 0) { + // A -1 dim was specified; infer the correct dimension by computing the + // product of the other dimensions. 
+ int explicit_count = constant_count_; + explicit_count *= bottom[0]->count(0, start_axis); + explicit_count *= bottom[0]->count(end_axis); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + explicit_count *= top_shape[start_axis + copy_axis_index]; + } + CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" + << bottom[0]->count() << ") must be divisible by the product of " + << "the specified dimensions (" << explicit_count << ")"; + const int inferred_dim = bottom[0]->count() / explicit_count; + top_shape[start_axis + inferred_axis_] = inferred_dim; + } + top[0]->Reshape(top_shape); + CHECK_EQ(top[0]->count(), bottom[0]->count()) + << "output count must match input count"; + top[0]->ShareData(*bottom[0]); + top[0]->ShareDiff(*bottom[0]); } INSTANTIATE_CLASS (ReshapeLayer); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 2a6d99e2..4048a8e8 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -10,87 +10,88 @@ namespace caffe { template void SigmoidCrossEntropyLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::LayerSetUp(bottom, top); - sigmoid_bottom_vec_.clear(); - sigmoid_bottom_vec_.push_back(bottom[0]); - sigmoid_top_vec_.clear(); - sigmoid_top_vec_.push_back(sigmoid_output_.get()); - sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + sigmoid_bottom_vec_.clear(); + sigmoid_bottom_vec_.push_back(bottom[0]); + sigmoid_top_vec_.clear(); + sigmoid_top_vec_.push_back(sigmoid_output_.get()); + sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); } template void SigmoidCrossEntropyLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::Reshape(bottom, 
top); - CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << - "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; - sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); + const vector*>& bottom, const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[0]->count(), bottom[1]->count()) + << "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; + sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } template void SigmoidCrossEntropyLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // The forward pass computes the sigmoid outputs. - sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - top[0]->mutable_cpu_data()[0] = loss / num; + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the sigmoid outputs. 
+ sigmoid_bottom_vec_[0] = bottom[0]; + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + // Compute the loss (negative log likelihood) + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + // Stable version of loss computation from input data + const Dtype* input_data = bottom[0]->cpu_data(); + const Dtype* target = bottom[1]->cpu_data(); + Dtype loss = 0; + for (int i = 0; i < count; ++i) { + loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) + - log( + 1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + } + top[0]->mutable_cpu_data()[0] = loss / num; } template void SigmoidCrossEntropyLossLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_sub(count, sigmoid_output_data, target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_scal(count, loss_weight / num, bottom_diff); - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); + const Dtype* target = bottom[1]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_sub(count, sigmoid_output_data, target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = 
top[0]->cpu_diff()[0]; + caffe_scal(count, loss_weight / num, bottom_diff); + } } template void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 833e1ced..a4359920 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -10,57 +10,56 @@ namespace caffe { template inline Dtype sigmoid(Dtype x) { - return 1. / (1. + exp(-x)); + return 1. / (1. 
+ exp(-x)); } template void SigmoidLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = sigmoid(bottom_data[i]); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = sigmoid(bottom_data[i]); + } } template void SigmoidLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - const Dtype sigmoid_x = top_data[i]; - bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + const Dtype sigmoid_x = top_data[i]; + bottom_diff[i] = top_diff[i] * sigmoid_x * (1. 
- sigmoid_x); + } + } } template void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidForward(count, bottom_data, top_data); } template void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward(count, top_diff, top_data, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidBackward(count, top_diff, top_data, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 502d0aab..1c463499 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -8,30 +8,29 @@ namespace caffe { template void SilenceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_cpu_data()); - } - } + const vector& propagate_down, const vector*>& 
bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_set(bottom[i]->count(), Dtype(0), bottom[i]->mutable_cpu_data()); + } + } } template void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. + const vector*>& top) { + // Do nothing. } template void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_data()); + } + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index a005ceba..da4059a0 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -9,117 +9,116 @@ namespace caffe { template void SliceLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const SliceParameter& slice_param = this->layer_param_.slice_param(); - CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) - << "Either axis or slice_dim should be specified; not both."; - slice_point_.clear(); - std::copy(slice_param.slice_point().begin(), - slice_param.slice_point().end(), - std::back_inserter(slice_point_)); + const vector*>& top) { + const SliceParameter& slice_param = this->layer_param_.slice_param(); + CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) + << "Either axis or slice_dim should be specified; not both."; + slice_point_.clear(); + std::copy(slice_param.slice_point().begin(), slice_param.slice_point().end(), + std::back_inserter(slice_point_)); } template void SliceLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - const int num_axes = bottom[0]->num_axes(); - const 
SliceParameter& slice_param = this->layer_param_.slice_param(); - if (slice_param.has_slice_dim()) { - slice_axis_ = static_cast(slice_param.slice_dim()); - // Don't allow negative indexing for slice_dim, a uint32 -- almost - // certainly unintended. - CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " - << "produced negative result; slice_dim must satisfy " - << "0 <= slice_dim < " << kMaxBlobAxes; - CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; - } else { - slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); - } - vector top_shape = bottom[0]->shape(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - num_slices_ = bottom[0]->count(0, slice_axis_); - slice_size_ = bottom[0]->count(slice_axis_ + 1); - int count = 0; - if (slice_point_.size() != 0) { - CHECK_EQ(slice_point_.size(), top.size() - 1); - CHECK_LE(top.size(), bottom_slice_axis); - int prev = 0; - vector slices; - for (int i = 0; i < slice_point_.size(); ++i) { - CHECK_GT(slice_point_[i], prev); - slices.push_back(slice_point_[i] - prev); - prev = slice_point_[i]; - } - slices.push_back(bottom_slice_axis - prev); - for (int i = 0; i < top.size(); ++i) { - top_shape[slice_axis_] = slices[i]; - top[i]->Reshape(top_shape); - count += top[i]->count(); - } - } else { - CHECK_EQ(bottom_slice_axis % top.size(), 0) - << "Number of top blobs (" << top.size() << ") should evenly " - << "divide input slice axis (" << bottom_slice_axis << ")"; - top_shape[slice_axis_] = bottom_slice_axis / top.size(); - for (int i = 0; i < top.size(); ++i) { - top[i]->Reshape(top_shape); - count += top[i]->count(); - } - } - CHECK_EQ(count, bottom[0]->count()); + const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + const SliceParameter& slice_param = this->layer_param_.slice_param(); + if (slice_param.has_slice_dim()) { + slice_axis_ = static_cast(slice_param.slice_dim()); + // Don't allow negative indexing for slice_dim, a uint32 -- almost + // 
certainly unintended. + CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " + << "produced negative result; slice_dim must satisfy " + << "0 <= slice_dim < " << kMaxBlobAxes; + CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; + } else { + slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); + } + vector top_shape = bottom[0]->shape(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + num_slices_ = bottom[0]->count(0, slice_axis_); + slice_size_ = bottom[0]->count(slice_axis_ + 1); + int count = 0; + if (slice_point_.size() != 0) { + CHECK_EQ(slice_point_.size(), top.size() - 1); + CHECK_LE(top.size(), bottom_slice_axis); + int prev = 0; + vector slices; + for (int i = 0; i < slice_point_.size(); ++i) { + CHECK_GT(slice_point_[i], prev); + slices.push_back(slice_point_[i] - prev); + prev = slice_point_[i]; + } + slices.push_back(bottom_slice_axis - prev); + for (int i = 0; i < top.size(); ++i) { + top_shape[slice_axis_] = slices[i]; + top[i]->Reshape(top_shape); + count += top[i]->count(); + } + } else { + CHECK_EQ(bottom_slice_axis % top.size(), 0) << "Number of top blobs (" + << top.size() << ") should evenly " << "divide input slice axis (" + << bottom_slice_axis << ")"; + top_shape[slice_axis_] = bottom_slice_axis / top.size(); + for (int i = 0; i < top.size(); ++i) { + top[i]->Reshape(top_shape); + count += top[i]->count(); + } + } + CHECK_EQ(count, bottom[0]->count()); } template void SliceLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_cpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - for (int n = 0; n < num_slices_; ++n) { - const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + 
offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - bottom_data + bottom_offset, top_data + top_offset); - } - offset_slice_axis += top_slice_axis; - } + const vector*>& top) { + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->cpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_cpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, bottom_data + bottom_offset, + top_data + top_offset); + } + offset_slice_axis += top_slice_axis; + } } template void SliceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - for (int n = 0; n < num_slices_; ++n) { - const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - top_diff + top_offset, bottom_diff + bottom_offset); - } - offset_slice_axis += top_slice_axis; - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + int offset_slice_axis = 0; + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const int top_slice_axis = 
top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, top_diff + top_offset, + bottom_diff + bottom_offset); + } + offset_slice_axis += top_slice_axis; + } } template void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { } template void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { } #ifdef CPU_ONLY diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index feb15321..92162821 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -9,19 +9,19 @@ namespace caffe { template void SoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); - top[0]->ReshapeLike(*bottom[0]); - vector mult_dims(1, bottom[0]->shape(softmax_axis_)); - sum_multiplier_.Reshape(mult_dims); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); - outer_num_ = bottom[0]->count(0, softmax_axis_); - inner_num_ = bottom[0]->count(softmax_axis_ + 1); - vector scale_dims = bottom[0]->shape(); - scale_dims[softmax_axis_] = 1; - scale_.Reshape(scale_dims); + const vector*>& top) { + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); + top[0]->ReshapeLike(*bottom[0]); + vector mult_dims(1, bottom[0]->shape(softmax_axis_)); + sum_multiplier_.Reshape(mult_dims); + Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = 
bottom[0]->count(softmax_axis_ + 1); + vector scale_dims = bottom[0]->shape(); + scale_dims[softmax_axis_] = 1; + scale_.Reshape(scale_dims); } template @@ -30,122 +30,120 @@ SoftmaxLayer::~SoftmaxLayer() { template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int channels = bottom[0]->shape(softmax_axis_); - int dim = bottom[0]->count() / outer_num_; - caffe_copy(bottom[0]->count(), bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - for (int i = 0; i < outer_num_; ++i) { - // initialize scale_data to the first plane - caffe_copy(inner_num_, bottom_data + i * dim, scale_data); - for (int j = 0; j < channels; j++) { - for (int k = 0; k < inner_num_; k++) { - scale_data[k] = std::max(scale_data[k], - bottom_data[i * dim + j * inner_num_ + k]); - } - } - // subtraction - caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_, - 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); - // exponentiation - caffe_exp < Dtype > (dim, top_data, top_data); - // sum after exp - caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1., - top_data, sum_multiplier_.cpu_data(), 0., scale_data); - // division - for (int j = 0; j < channels; j++) { - caffe_div(inner_num_, top_data, scale_data, top_data); - top_data += inner_num_; - } - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int channels = bottom[0]->shape(softmax_axis_); + int dim = bottom[0]->count() / outer_num_; + caffe_copy(bottom[0]->count(), bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ for (int i = 0; i < outer_num_; ++i) { + // initialize scale_data to the first plane + caffe_copy(inner_num_, bottom_data + i * dim, scale_data); + for (int j = 0; j < channels; j++) { + for (int k = 0; k < inner_num_; k++) { + scale_data[k] = std::max(scale_data[k], + bottom_data[i * dim + j * inner_num_ + k]); + } + } + // subtraction + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); + // exponentiation + caffe_exp < Dtype > (dim, top_data, top_data); + // sum after exp + caffe_cpu_gemv < Dtype + > (CblasTrans, channels, inner_num_, 1., top_data, sum_multiplier_.cpu_data(), 0., scale_data); + // division + for (int j = 0; j < channels; j++) { + caffe_div(inner_num_, top_data, scale_data, top_data); + top_data += inner_num_; + } + } } template void SoftmaxLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int channels = top[0]->shape(softmax_axis_); - int dim = top[0]->count() / outer_num_; - caffe_copy(top[0]->count(), top_diff, bottom_diff); - for (int i = 0; i < outer_num_; ++i) { - // compute dot(top_diff, top_data) and subtract them from the bottom diff - for (int k = 0; k < inner_num_; ++k) { - scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels, - bottom_diff + i * dim + k, inner_num_, - top_data + i * dim + k, inner_num_); - } - // subtraction - caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, - -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff - + i * dim); - } - // elementwise multiplication - caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const 
Dtype* top_data = top[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int channels = top[0]->shape(softmax_axis_); + int dim = top[0]->count() / outer_num_; + caffe_copy(top[0]->count(), top_diff, bottom_diff); + for (int i = 0; i < outer_num_; ++i) { + // compute dot(top_diff, top_data) and subtract them from the bottom diff + for (int k = 0; k < inner_num_; ++k) { + scale_data[k] = caffe_cpu_strided_dot < Dtype + > (channels, bottom_diff + i * dim + k, inner_num_, top_data + i * dim + + k, inner_num_); + } + // subtraction + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + + i * dim); + } + // elementwise multiplication + caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } // begin: code written/modified by AMD template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = bottom[0]->count(); + int channels = top[0]->shape(softmax_axis_); - caffe_gpu_copy(count, bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - // compute max - // NOLINT_NEXT_LINE(whitespace/operators) + caffe_gpu_copy(count, bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ // compute max + // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp < Dtype > (count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, top_data); + kernel_channel_max < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp < Dtype > (count, top_data, top_data); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); } template void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_gpu_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
- // NOLINT_NEXT_LINE(whitespace/operators) + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = top[0]->count(); + int channels = top[0]->shape(softmax_axis_); + caffe_gpu_copy(count, top_diff, bottom_diff); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. + // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); + kernel_channel_dot < Dtype + > (outer_num_, channels, inner_num_, top_diff, top_data, scale_data); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, bottom_diff); + // elementwise multiplication + caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); } // end: code written/modified by AMD diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 6b9e9e67..62c10e30 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -10,32 +10,31 @@ namespace caffe { template -void SoftmaxWithLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::LayerSetUp(bottom, top); - LayerParameter softmax_param(this->layer_param_); - softmax_param.set_type("Softmax"); - softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param); - softmax_bottom_vec_.clear(); - softmax_bottom_vec_.push_back(bottom[0]); - softmax_top_vec_.clear(); - 
softmax_top_vec_.push_back(&prob_); - softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); +void SoftmaxWithLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); + LayerParameter softmax_param(this->layer_param_); + softmax_param.set_type("Softmax"); + softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param); + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.clear(); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); - has_ignore_label_ = - this->layer_param_.loss_param().has_ignore_label(); - if (has_ignore_label_) { - ignore_label_ = this->layer_param_.loss_param().ignore_label(); - } - normalize_ = this->layer_param_.loss_param().normalize(); + has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + normalize_ = this->layer_param_.loss_param().normalize(); - ocl_setup(); + ocl_setup(); } template void SoftmaxWithLossLayer::ocl_setup() { - d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, - sizeof(Dtype), NULL, NULL); + d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, + sizeof(Dtype), NULL, NULL); } @@ -44,160 +43,161 @@ SoftmaxWithLossLayer::~SoftmaxWithLossLayer() { } template -void SoftmaxWithLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer < Dtype > ::Reshape(bottom, top); - softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); - outer_num_ = bottom[0]->count(0, softmax_axis_); - inner_num_ = bottom[0]->count(softmax_axis_ + 1); - CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) - << "Number of labels must match number of predictions; " - << "e.g., if softmax axis == 1 and prediction shape is (N, 
C, H, W), " - << "label count (number of labels) must be N*H*W, " - << "with integer values in {0, 1, ..., C-1}."; - if (top.size() >= 2) { - // softmax output - top[1]->ReshapeLike(*bottom[0]); - } +void SoftmaxWithLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + if (top.size() >= 2) { + // softmax output + top[1]->ReshapeLike(*bottom[0]); + } } template void SoftmaxWithLossLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - // The forward pass computes the softmax prob values. 
- softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.cpu_data(); - const Dtype* label = bottom[1]->cpu_data(); - int dim = prob_.count() / outer_num_; - int count = 0; - Dtype loss = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; j++) { - const int label_value = static_cast(label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - continue; - } - DCHECK_GE(label_value, 0); - DCHECK_LT(label_value, prob_.shape(softmax_axis_)); - loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], - Dtype(FLT_MIN))); - ++count; - } - } - if (normalize_) { - top[0]->mutable_cpu_data()[0] = loss / count; - } else { - top[0]->mutable_cpu_data()[0] = loss / outer_num_; - } - if (top.size() == 2) { - top[1]->ShareData(prob_); - } + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the softmax prob values. + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + int count = 0; + Dtype loss = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, prob_.shape(softmax_axis_)); + loss -= log( + std::max(prob_data[i * dim + label_value * inner_num_ + j], + Dtype(FLT_MIN))); + ++count; + } + } + if (normalize_) { + top[0]->mutable_cpu_data()[0] = loss / count; + } else { + top[0]->mutable_cpu_data()[0] = loss / outer_num_; + } + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } template void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer 
cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); - caffe_copy(prob_.count(), prob_data, bottom_diff); - const Dtype* label = bottom[1]->cpu_data(); - int dim = prob_.count() / outer_num_; - int count = 0; - for (int i = 0; i < outer_num_; ++i) { - for (int j = 0; j < inner_num_; ++j) { - const int label_value = static_cast(label[i * inner_num_ + j]); - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { - bottom_diff[i * dim + c * inner_num_ + j] = 0; - } - } else { - bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; - ++count; - } - } - } - // Scale gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - caffe_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* prob_data = prob_.cpu_data(); + caffe_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { + bottom_diff[i * dim + c * inner_num_ + j] = 0; + } + } else { + bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; + ++count; + } + } + } + // Scale gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + caffe_scal(prob_.count(), loss_weight / count, 
bottom_diff); + } else { + caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } } // begin: code written/modified by AMD template void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU < Dtype > (nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. 
+ Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU < Dtype + > (nthreads, prob_data, label, loss_data, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + loss /= count; + } else { + loss /= outer_num_; + } + top[0]->mutable_cpu_data()[0] = loss; + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } template void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. 
- Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU < Dtype + > (nthreads, top_data, label, bottom_diff, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); + } else { + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } } // end: code written/modified by AMD #ifdef CPU_ONLY diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 54bea0d6..7a40bf8a 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -8,78 +8,78 @@ namespace caffe { template void SplitLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - count_ = bottom[0]->count(); - for (int i = 0; i < top.size(); ++i) { - // Do not allow in-place computation in the SplitLayer. Instead, share data - // by reference in the forward pass, and keep separate diff allocations in - // the backward pass. (Technically, it should be possible to share the diff - // blob of the first split output with the input, but this seems to cause - // some strange effects in practice...) - CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; - top[i]->ReshapeLike(*bottom[0]); - CHECK_EQ(count_, top[i]->count()); - } - gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", - NULL); + const vector*>& top) { + count_ = bottom[0]->count(); + for (int i = 0; i < top.size(); ++i) { + // Do not allow in-place computation in the SplitLayer. Instead, share data + // by reference in the forward pass, and keep separate diff allocations in + // the backward pass. 
(Technically, it should be possible to share the diff + // blob of the first split output with the input, but this seems to cause + // some strange effects in practice...) + CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " + "allow in-place computation."; + top[i]->ReshapeLike(*bottom[0]); + CHECK_EQ(count_, top[i]->count()); + } + gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", + NULL); } template void SplitLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } } template void SplitLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - if (top.size() == 1) { - caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); - return; - } - caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), - bottom[0]->mutable_cpu_diff()); - // Add remaining top blob diffs. - for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); + return; + } + caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), + bottom[0]->mutable_cpu_diff()); + // Add remaining top blob diffs. 
+ for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } } template void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } } // begin: code written/modified by AMD template void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - if (top.size() == 1) { - caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. - for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); + return; + } + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. 
+ for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } } // end: code written/modified by AMD #ifdef CPU_ONLY diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp index 4c630fb7..d552af61 100644 --- a/src/caffe/layers/spp_layer.cpp +++ b/src/caffe/layers/spp_layer.cpp @@ -15,175 +15,172 @@ using std::max; template LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param) { - LayerParameter pooling_param; - int num_bins = pow(2, pyramid_level); - - // find padding and kernel size so that the pooling is - // performed across the entire image - int kernel_h = ceil(bottom_h / static_cast(num_bins)); - // remainder_h is the min number of pixels that need to be padded before - // entire image height is pooled over with the chosen kernel dimension - int remainder_h = kernel_h * num_bins - bottom_h; - // pooling layer pads (2 * pad_h) pixels on the top and bottom of the - // image. 
- int pad_h = (remainder_h + 1) / 2; - - // similar logic for width - int kernel_w = ceil(bottom_w / static_cast(num_bins)); - int remainder_w = kernel_w * num_bins - bottom_w; - int pad_w = (remainder_w + 1) / 2; - - pooling_param.mutable_pooling_param()->set_pad_h(pad_h); - pooling_param.mutable_pooling_param()->set_pad_w(pad_w); - pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h); - pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w); - pooling_param.mutable_pooling_param()->set_stride_h(kernel_h); - pooling_param.mutable_pooling_param()->set_stride_w(kernel_w); - - switch (spp_param.pool()) { - case SPPParameter_PoolMethod_MAX: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); - break; - case SPPParameter_PoolMethod_AVE: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - break; - case SPPParameter_PoolMethod_STOCHASTIC: - pooling_param.mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - - return pooling_param; + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + LayerParameter pooling_param; + int num_bins = pow(2, pyramid_level); + + // find padding and kernel size so that the pooling is + // performed across the entire image + int kernel_h = ceil(bottom_h / static_cast(num_bins)); + // remainder_h is the min number of pixels that need to be padded before + // entire image height is pooled over with the chosen kernel dimension + int remainder_h = kernel_h * num_bins - bottom_h; + // pooling layer pads (2 * pad_h) pixels on the top and bottom of the + // image. 
+ int pad_h = (remainder_h + 1) / 2; + + // similar logic for width + int kernel_w = ceil(bottom_w / static_cast(num_bins)); + int remainder_w = kernel_w * num_bins - bottom_w; + int pad_w = (remainder_w + 1) / 2; + + pooling_param.mutable_pooling_param()->set_pad_h(pad_h); + pooling_param.mutable_pooling_param()->set_pad_w(pad_w); + pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h); + pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w); + pooling_param.mutable_pooling_param()->set_stride_h(kernel_h); + pooling_param.mutable_pooling_param()->set_stride_w(kernel_w); + + switch (spp_param.pool()) { + case SPPParameter_PoolMethod_MAX: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_MAX); + break; + case SPPParameter_PoolMethod_AVE: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + break; + case SPPParameter_PoolMethod_STOCHASTIC: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_STOCHASTIC); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + + return pooling_param; } template void SPPLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SPPParameter spp_param = this->layer_param_.spp_param(); - - bottom_h_ = bottom[0]->height(); - bottom_w_ = bottom[0]->width(); - CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero."; - CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero."; - - pyramid_height_ = spp_param.pyramid_height(); - split_top_vec_.clear(); - pooling_bottom_vecs_.clear(); - pooling_layers_.clear(); - pooling_top_vecs_.clear(); - pooling_outputs_.clear(); - flatten_layers_.clear(); - flatten_top_vecs_.clear(); - flatten_outputs_.clear(); - concat_bottom_vec_.clear(); - - // split layer output holders setup - for (int i = 0; i < pyramid_height_; i++) { - split_top_vec_.push_back(new Blob()); - } - - // split layer setup - LayerParameter split_param; - split_layer_.reset(new 
SplitLayer(split_param)); - split_layer_->SetUp(bottom, split_top_vec_); - - for (int i = 0; i < pyramid_height_; i++) { - // pooling layer input holders setup - pooling_bottom_vecs_.push_back(new vector*>); - pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]); - - // pooling layer output holders setup - pooling_outputs_.push_back(new Blob()); - pooling_top_vecs_.push_back(new vector*>); - pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); - - // pooling layer setup - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_.push_back(shared_ptr < PoolingLayer > ( - new PoolingLayer(pooling_param))); - pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - - // flatten layer output holders setup - flatten_outputs_.push_back(new Blob()); - flatten_top_vecs_.push_back(new vector*>); - flatten_top_vecs_[i]->push_back(flatten_outputs_[i]); - - // flatten layer setup - LayerParameter flatten_param; - flatten_layers_.push_back(new FlattenLayer(flatten_param)); - flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); - - // concat layer input holders setup - concat_bottom_vec_.push_back(flatten_outputs_[i]); - } - - // concat layer setup - LayerParameter concat_param; - concat_layer_.reset(new ConcatLayer(concat_param)); - concat_layer_->SetUp(concat_bottom_vec_, top); + const vector*>& top) { + SPPParameter spp_param = this->layer_param_.spp_param(); + + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero."; + CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero."; + + pyramid_height_ = spp_param.pyramid_height(); + split_top_vec_.clear(); + pooling_bottom_vecs_.clear(); + pooling_layers_.clear(); + pooling_top_vecs_.clear(); + pooling_outputs_.clear(); + flatten_layers_.clear(); + flatten_top_vecs_.clear(); + flatten_outputs_.clear(); + concat_bottom_vec_.clear(); + + // split layer output 
holders setup + for (int i = 0; i < pyramid_height_; i++) { + split_top_vec_.push_back(new Blob()); + } + + // split layer setup + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, split_top_vec_); + + for (int i = 0; i < pyramid_height_; i++) { + // pooling layer input holders setup + pooling_bottom_vecs_.push_back(new vector*>); + pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]); + + // pooling layer output holders setup + pooling_outputs_.push_back(new Blob()); + pooling_top_vecs_.push_back(new vector*>); + pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); + + // pooling layer setup + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); + + pooling_layers_.push_back( + shared_ptr < PoolingLayer + > (new PoolingLayer(pooling_param))); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + + // flatten layer output holders setup + flatten_outputs_.push_back(new Blob()); + flatten_top_vecs_.push_back(new vector*>); + flatten_top_vecs_[i]->push_back(flatten_outputs_[i]); + + // flatten layer setup + LayerParameter flatten_param; + flatten_layers_.push_back(new FlattenLayer(flatten_param)); + flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); + + // concat layer input holders setup + concat_bottom_vec_.push_back(flatten_outputs_[i]); + } + + // concat layer setup + LayerParameter concat_param; + concat_layer_.reset(new ConcatLayer(concat_param)); + concat_layer_->SetUp(concat_bottom_vec_, top); } template void SPPLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " - << "corresponding to (num, channels, height, width)"; - channels_ = bottom[0]->channels(); - bottom_h_ = bottom[0]->height(); - bottom_w_ = bottom[0]->width(); - SPPParameter spp_param = this->layer_param_.spp_param(); - split_layer_->Reshape(bottom, split_top_vec_); - for (int i = 
0; i < pyramid_height_; i++) { - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_[i].reset( - new PoolingLayer(pooling_param)); - pooling_layers_[i]->SetUp( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - pooling_layers_[i]->Reshape( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Reshape( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); - } - concat_layer_->Reshape(concat_bottom_vec_, top); + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = bottom[0]->channels(); + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + SPPParameter spp_param = this->layer_param_.spp_param(); + split_layer_->Reshape(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); + + pooling_layers_[i].reset(new PoolingLayer(pooling_param)); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + pooling_layers_[i]->Reshape(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + flatten_layers_[i]->Reshape(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); + } + concat_layer_->Reshape(concat_bottom_vec_, top); } template void SPPLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - split_layer_->Forward(bottom, split_top_vec_); - for (int i = 0; i < pyramid_height_; i++) { - pooling_layers_[i]->Forward( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Forward( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); - } - concat_layer_->Forward(concat_bottom_vec_, top); + const vector*>& top) { + split_layer_->Forward(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + pooling_layers_[i]->Forward(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + flatten_layers_[i]->Forward(*pooling_top_vecs_[i], 
*flatten_top_vecs_[i]); + } + concat_layer_->Forward(concat_bottom_vec_, top); } template void SPPLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - vector concat_propagate_down(pyramid_height_, true); - concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); - for (int i = 0; i < pyramid_height_; i++) { - flatten_layers_[i]->Backward( - *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); - pooling_layers_[i]->Backward( - *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); - } - split_layer_->Backward(split_top_vec_, propagate_down, bottom); + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + vector concat_propagate_down(pyramid_height_, true); + concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); + for (int i = 0; i < pyramid_height_; i++) { + flatten_layers_[i]->Backward(*flatten_top_vecs_[i], propagate_down, + *pooling_top_vecs_[i]); + pooling_layers_[i]->Backward(*pooling_top_vecs_[i], propagate_down, + *pooling_bottom_vecs_[i]); + } + split_layer_->Backward(split_top_vec_, propagate_down, bottom); } INSTANTIATE_CLASS (SPPLayer); diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 52a8a8c7..3e85330c 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -12,53 +12,52 @@ namespace caffe { template void TanHLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = tanh(bottom_data[i]); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] 
= tanh(bottom_data[i]); + } } template void TanHLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const int count = bottom[0]->count(); - Dtype tanhx; - for (int i = 0; i < count; ++i) { - tanhx = top_data[i]; - bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx); - } - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype tanhx; + for (int i = 0; i < count; ++i) { + tanhx = top_data[i]; + bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx); + } + } } template void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward(count, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHForward(count, bottom_data, top_data); } template void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward(count, top_diff, top_data, bottom_diff); - } + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + 
const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHBackward(count, top_diff, top_data, bottom_diff); + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 7d99226f..16ca8944 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -8,30 +8,30 @@ namespace caffe { template void ThresholdLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer < Dtype > ::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.threshold_param().threshold(); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.threshold_param().threshold(); } template void ThresholdLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0); - } + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = (bottom_data[i] > threshold_) ? 
Dtype(1) : Dtype(0); + } } template void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward(count, threshold_, bottom_data, top_data); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ThresholdForward(count, threshold_, bottom_data, top_data); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 68b1b1e5..7085ac63 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -27,406 +27,400 @@ namespace caffe { template WindowDataLayer::~WindowDataLayer() { - this->JoinPrefetchThread(); + this->JoinPrefetchThread(); } template void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - // LayerSetUp runs through the window_file and creates two structures - // that hold windows: one for foreground (object) windows and one - // for background (non-object) windows. We use an overlap threshold - // to decide which is which. 
- - // window_file format - // repeated: - // # image_index - // img_path (abs path) - // channels - // height - // width - // num_windows - // class_index overlap x1 y1 x2 y2 - - LOG(INFO) << "Window data layer:" << std::endl - << " foreground (object) overlap threshold: " - << this->layer_param_.window_data_param().fg_threshold() << std::endl - << " background (non-object) overlap threshold: " - << this->layer_param_.window_data_param().bg_threshold() << std::endl - << " foreground sampling fraction: " - << this->layer_param_.window_data_param().fg_fraction() << std::endl - << " cache_images: " - << this->layer_param_.window_data_param().cache_images() << std::endl - << " root_folder: " - << this->layer_param_.window_data_param().root_folder(); - - cache_images_ = this->layer_param_.window_data_param().cache_images(); - string root_folder = this->layer_param_.window_data_param().root_folder(); - - const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); - if (prefetch_needs_rand) { - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - } else { - prefetch_rng_.reset(); - } - - std::ifstream infile(this->layer_param_.window_data_param().source().c_str()); - CHECK(infile.good()) << "Failed to open window file " - << this->layer_param_.window_data_param().source() << std::endl; - - map label_hist; - label_hist.insert(std::make_pair(0, 0)); - - string hashtag; - int image_index, channels; - if (!(infile >> hashtag >> image_index)) { - LOG(FATAL) << "Window file is empty"; - } - do { - CHECK_EQ(hashtag, "#"); - // read image path - string image_path; - infile >> image_path; - image_path = root_folder + image_path; - // read image dimensions - vector image_size(3); - infile >> image_size[0] >> image_size[1] >> image_size[2]; - channels = image_size[0]; - image_database_.push_back(std::make_pair(image_path, image_size)); - - if (cache_images_) { - Datum 
datum; - if (!ReadFileToDatum(image_path, &datum)) { - LOG(ERROR) << "Could not open or find file " << image_path; - return; - } - image_database_cache_.push_back(std::make_pair(image_path, datum)); - } - // read each box - int num_windows; - infile >> num_windows; - const float fg_threshold = - this->layer_param_.window_data_param().fg_threshold(); - const float bg_threshold = - this->layer_param_.window_data_param().bg_threshold(); - for (int i = 0; i < num_windows; ++i) { - int label, x1, y1, x2, y2; - float overlap; - infile >> label >> overlap >> x1 >> y1 >> x2 >> y2; - - vector window(WindowDataLayer::NUM); - window[WindowDataLayer::IMAGE_INDEX] = image_index; - window[WindowDataLayer::LABEL] = label; - window[WindowDataLayer::OVERLAP] = overlap; - window[WindowDataLayer::X1] = x1; - window[WindowDataLayer::Y1] = y1; - window[WindowDataLayer::X2] = x2; - window[WindowDataLayer::Y2] = y2; - - // add window to foreground list or background list - if (overlap >= fg_threshold) { - int label = window[WindowDataLayer::LABEL]; - CHECK_GT(label, 0); - fg_windows_.push_back(window); - label_hist.insert(std::make_pair(label, 0)); - label_hist[label]++; - } else if (overlap < bg_threshold) { - // background window, force label and overlap to 0 - window[WindowDataLayer::LABEL] = 0; - window[WindowDataLayer::OVERLAP] = 0; - bg_windows_.push_back(window); - label_hist[0]++; - } - } - - if (image_index % 100 == 0) { - LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; - } - } while (infile >> hashtag >> image_index); - - LOG(INFO) << "Number of images: " << image_index + 1; - - for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { - LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; - } - - LOG(INFO) << "Amount of context padding: " - << 
this->layer_param_.window_data_param().context_pad(); - - LOG(INFO) << "Crop mode: " - << this->layer_param_.window_data_param().crop_mode(); - - // image - const int crop_size = this->transform_param_.crop_size(); - CHECK_GT(crop_size, 0); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - top[0]->Reshape(batch_size, channels, crop_size, crop_size); - this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - this->prefetch_label_.Reshape(label_shape); - - // data mean - has_mean_file_ = this->transform_param_.has_mean_file(); - has_mean_values_ = this->transform_param_.mean_value_size() > 0; - if (has_mean_file_) { - const string& mean_file = - this->transform_param_.mean_file(); - LOG(INFO) << "Loading mean file from: " << mean_file; - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - if (has_mean_values_) { - CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { - mean_values_.push_back(this->transform_param_.mean_value(c)); - } - CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << - "Specify either 1 mean_value or as many as channels: " << channels; - if (channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } + const vector*>& top) { + // LayerSetUp runs through the window_file and creates two structures + // that hold windows: one for foreground (object) windows and one + // for background (non-object) windows. 
We use an overlap threshold + // to decide which is which. + + // window_file format + // repeated: + // # image_index + // img_path (abs path) + // channels + // height + // width + // num_windows + // class_index overlap x1 y1 x2 y2 + + LOG(INFO) << "Window data layer:" << std::endl + << " foreground (object) overlap threshold: " + << this->layer_param_.window_data_param().fg_threshold() << std::endl + << " background (non-object) overlap threshold: " + << this->layer_param_.window_data_param().bg_threshold() << std::endl + << " foreground sampling fraction: " + << this->layer_param_.window_data_param().fg_fraction() << std::endl + << " cache_images: " + << this->layer_param_.window_data_param().cache_images() << std::endl + << " root_folder: " + << this->layer_param_.window_data_param().root_folder(); + + cache_images_ = this->layer_param_.window_data_param().cache_images(); + string root_folder = this->layer_param_.window_data_param().root_folder(); + + const bool prefetch_needs_rand = this->transform_param_.mirror() + || this->transform_param_.crop_size(); + if (prefetch_needs_rand) { + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + } else { + prefetch_rng_.reset(); + } + + std::ifstream infile(this->layer_param_.window_data_param().source().c_str()); + CHECK(infile.good()) << "Failed to open window file " + << this->layer_param_.window_data_param().source() << std::endl; + + map label_hist; + label_hist.insert(std::make_pair(0, 0)); + + string hashtag; + int image_index, channels; + if (!(infile >> hashtag >> image_index)) { + LOG(FATAL) << "Window file is empty"; + } + do { + CHECK_EQ(hashtag, "#"); + // read image path + string image_path; + infile >> image_path; + image_path = root_folder + image_path; + // read image dimensions + vector image_size(3); + infile >> image_size[0] >> image_size[1] >> image_size[2]; + channels = image_size[0]; + 
image_database_.push_back(std::make_pair(image_path, image_size)); + + if (cache_images_) { + Datum datum; + if (!ReadFileToDatum(image_path, &datum)) { + LOG(ERROR) << "Could not open or find file " << image_path; + return; + } + image_database_cache_.push_back(std::make_pair(image_path, datum)); + } + // read each box + int num_windows; + infile >> num_windows; + const float fg_threshold = + this->layer_param_.window_data_param().fg_threshold(); + const float bg_threshold = + this->layer_param_.window_data_param().bg_threshold(); + for (int i = 0; i < num_windows; ++i) { + int label, x1, y1, x2, y2; + float overlap; + infile >> label >> overlap >> x1 >> y1 >> x2 >> y2; + + vector window(WindowDataLayer::NUM); + window[WindowDataLayer::IMAGE_INDEX] = image_index; + window[WindowDataLayer::LABEL] = label; + window[WindowDataLayer::OVERLAP] = overlap; + window[WindowDataLayer::X1] = x1; + window[WindowDataLayer::Y1] = y1; + window[WindowDataLayer::X2] = x2; + window[WindowDataLayer::Y2] = y2; + + // add window to foreground list or background list + if (overlap >= fg_threshold) { + int label = window[WindowDataLayer::LABEL]; + CHECK_GT(label, 0); + fg_windows_.push_back(window); + label_hist.insert(std::make_pair(label, 0)); + label_hist[label]++; + } else if (overlap < bg_threshold) { + // background window, force label and overlap to 0 + window[WindowDataLayer::LABEL] = 0; + window[WindowDataLayer::OVERLAP] = 0; + bg_windows_.push_back(window); + label_hist[0]++; + } + } + + if (image_index % 100 == 0) { + LOG(INFO) << "num: " << image_index << " " << image_path << " " + << image_size[0] << " " << image_size[1] << " " << image_size[2] + << " " << "windows to process: " << num_windows; + } + } while (infile >> hashtag >> image_index); + + LOG(INFO) << "Number of images: " << image_index + 1; + + for (map::iterator it = label_hist.begin(); it != label_hist.end(); + ++it) { + LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] + << " samples"; + } 
+ + LOG(INFO) << "Amount of context padding: " + << this->layer_param_.window_data_param().context_pad(); + + LOG(INFO) << "Crop mode: " + << this->layer_param_.window_data_param().crop_mode(); + + // image + const int crop_size = this->transform_param_.crop_size(); + CHECK_GT(crop_size, 0); + const int batch_size = this->layer_param_.window_data_param().batch_size(); + top[0]->Reshape(batch_size, channels, crop_size, crop_size); + this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); + + // data mean + has_mean_file_ = this->transform_param_.has_mean_file(); + has_mean_values_ = this->transform_param_.mean_value_size() > 0; + if (has_mean_file_) { + const string& mean_file = this->transform_param_.mean_file(); + LOG(INFO) << "Loading mean file from: " << mean_file; + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + data_mean_.FromProto(blob_proto); + } + if (has_mean_values_) { + CHECK(has_mean_file_ == false) + << "Cannot specify mean_file and mean_value at the same time"; + for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { + mean_values_.push_back(this->transform_param_.mean_value(c)); + } + CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) + << "Specify either 1 mean_value or as many as channels: " << channels; + if (channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } } template unsigned int WindowDataLayer::PrefetchRand() { - CHECK (prefetch_rng_); - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - return (*prefetch_rng)(); + CHECK (prefetch_rng_); + 
caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + return (*prefetch_rng)(); } // Thread fetching the data template void WindowDataLayer::InternalThreadEntry() { - // At each iteration, sample N windows where N*p are foreground (object) - // windows and N*(1-p) are background (non-object) windows - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); - Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); - const Dtype scale = this->layer_param_.window_data_param().scale(); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - const int context_pad = this->layer_param_.window_data_param().context_pad(); - const int crop_size = this->transform_param_.crop_size(); - const bool mirror = this->transform_param_.mirror(); - const float fg_fraction = - this->layer_param_.window_data_param().fg_fraction(); - Dtype* mean = NULL; - int mean_off = 0; - int mean_width = 0; - int mean_height = 0; - if (this->has_mean_file_) { - mean = this->data_mean_.mutable_cpu_data(); - mean_off = (this->data_mean_.width() - crop_size) / 2; - mean_width = this->data_mean_.width(); - mean_height = this->data_mean_.height(); - } - cv::Size cv_crop_size(crop_size, crop_size); - const string& crop_mode = this->layer_param_.window_data_param().crop_mode(); - - bool use_square = (crop_mode == "square") ? true : false; - - // zero out batch - caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); - - const int num_fg = static_cast(static_cast(batch_size) - * fg_fraction); - const int num_samples[2] = { batch_size - num_fg, num_fg }; - - int item_id = 0; - // sample from bg set then fg set - for (int is_fg = 0; is_fg < 2; ++is_fg) { - for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { - // sample a window - timer.Start(); - const unsigned int rand_index = PrefetchRand(); - vector window = - (is_fg) ? 
- fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; - - bool do_mirror = mirror && PrefetchRand() % 2; - - // load the image containing the window - pair > image = - image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; - - cv::Mat cv_img; - if (this->cache_images_) { - pair < std::string, Datum > image_cached = - image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; - cv_img = DecodeDatumToCVMat(image_cached.second, true); - } else { - cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); - if (!cv_img.data) { - LOG(ERROR) << "Could not open or find file " << image.first; - return; - } - } - read_time += timer.MicroSeconds(); - timer.Start(); - const int channels = cv_img.channels(); - - // crop window out of image and warp it - int x1 = window[WindowDataLayer < Dtype > ::X1]; - int y1 = window[WindowDataLayer < Dtype > ::Y1]; - int x2 = window[WindowDataLayer < Dtype > ::X2]; - int y2 = window[WindowDataLayer < Dtype > ::Y2]; - - int pad_w = 0; - int pad_h = 0; - if (context_pad > 0 || use_square) { - // scale factor by which to expand the original region - // such that after warping the expanded region to crop_size x crop_size - // there's exactly context_pad amount of padding on each side - Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2 * context_pad); - - // compute the expanded region - Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; - Dtype half_width = static_cast(x2 - x1 + 1) / 2.0; - Dtype center_x = static_cast(x1) + half_width; - Dtype center_y = static_cast(y1) + half_height; - if (use_square) { - if (half_height > half_width) { - half_width = half_height; - } else { - half_height = half_width; - } - } - x1 = static_cast(round(center_x - half_width * context_scale)); - x2 = static_cast(round(center_x + half_width * context_scale)); - y1 = static_cast(round(center_y - half_height * context_scale)); - y2 = static_cast(round(center_y + half_height * 
context_scale)); - - // the expanded region may go outside of the image - // so we compute the clipped (expanded) region and keep track of - // the extent beyond the image - int unclipped_height = y2 - y1 + 1; - int unclipped_width = x2 - x1 + 1; - int pad_x1 = std::max(0, -x1); - int pad_y1 = std::max(0, -y1); - int pad_x2 = std::max(0, x2 - cv_img.cols + 1); - int pad_y2 = std::max(0, y2 - cv_img.rows + 1); - // clip bounds - x1 = x1 + pad_x1; - x2 = x2 - pad_x2; - y1 = y1 + pad_y1; - y2 = y2 - pad_y2; - CHECK_GT(x1, -1); - CHECK_GT(y1, -1); - CHECK_LT(x2, cv_img.cols); - CHECK_LT(y2, cv_img.rows); - - int clipped_height = y2 - y1 + 1; - int clipped_width = x2 - x1 + 1; - - // scale factors that would be used to warp the unclipped - // expanded region - Dtype scale_x = - static_cast(crop_size) / static_cast(unclipped_width); - Dtype scale_y = - static_cast(crop_size) - / static_cast(unclipped_height); - - // size to warp the clipped expanded region to - cv_crop_size.width = - static_cast(round(static_cast(clipped_width) * scale_x)); - cv_crop_size.height = - static_cast(round(static_cast(clipped_height) * scale_y)); - pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); - pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); - pad_y1 = static_cast(round(static_cast(pad_y1) * scale_y)); - pad_y2 = static_cast(round(static_cast(pad_y2) * scale_y)); - - pad_h = pad_y1; - // if we're mirroring, we mirror the padding too (to be pedantic) - if (do_mirror) { - pad_w = pad_x2; - } else { - pad_w = pad_x1; - } - - // ensure that the warped, clipped region plus the padding fits in the - // crop_size x crop_size image (it might not due to rounding) - if (pad_h + cv_crop_size.height > crop_size) { - cv_crop_size.height = crop_size - pad_h; - } - if (pad_w + cv_crop_size.width > crop_size) { - cv_crop_size.width = crop_size - pad_w; - } - } - - cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); - cv::Mat cv_cropped_img = cv_img(roi); - cv::resize(cv_cropped_img, 
cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); - - // horizontal flip at random - if (do_mirror) { - cv::flip(cv_cropped_img, cv_cropped_img, 1); - } - - // copy the warped window into top_data - for (int h = 0; h < cv_cropped_img.rows; ++h) { - const uchar* ptr = cv_cropped_img.ptr < uchar > (h); - int img_index = 0; - for (int w = 0; w < cv_cropped_img.cols; ++w) { - for (int c = 0; c < channels; ++c) { - int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (this->has_mean_file_) { - int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; - top_data[top_index] = (pixel - mean[mean_index]) * scale; - } else { - if (this->has_mean_values_) { - top_data[top_index] = (pixel - this->mean_values_[c]) * scale; - } else { - top_data[top_index] = pixel * scale; - } - } - } - } - } - trans_time += timer.MicroSeconds(); - // get window label - top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL]; - - item_id++; - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; + // At each iteration, sample N windows where N*p are foreground (object) + // windows and N*(1-p) are background (non-object) windows + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + Dtype* top_data = this->prefetch_data_.mutable_cpu_data(); + Dtype* top_label = this->prefetch_label_.mutable_cpu_data(); + const Dtype scale = this->layer_param_.window_data_param().scale(); + const int batch_size = this->layer_param_.window_data_param().batch_size(); + const int context_pad = this->layer_param_.window_data_param().context_pad(); + const int crop_size = 
this->transform_param_.crop_size(); + const bool mirror = this->transform_param_.mirror(); + const float fg_fraction = + this->layer_param_.window_data_param().fg_fraction(); + Dtype* mean = NULL; + int mean_off = 0; + int mean_width = 0; + int mean_height = 0; + if (this->has_mean_file_) { + mean = this->data_mean_.mutable_cpu_data(); + mean_off = (this->data_mean_.width() - crop_size) / 2; + mean_width = this->data_mean_.width(); + mean_height = this->data_mean_.height(); + } + cv::Size cv_crop_size(crop_size, crop_size); + const string& crop_mode = this->layer_param_.window_data_param().crop_mode(); + + bool use_square = (crop_mode == "square") ? true : false; + + // zero out batch + caffe_set(this->prefetch_data_.count(), Dtype(0), top_data); + + const int num_fg = static_cast(static_cast(batch_size) + * fg_fraction); + const int num_samples[2] = { batch_size - num_fg, num_fg }; + + int item_id = 0; + // sample from bg set then fg set + for (int is_fg = 0; is_fg < 2; ++is_fg) { + for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { + // sample a window + timer.Start(); + const unsigned int rand_index = PrefetchRand(); + vector window = + (is_fg) ? 
+ fg_windows_[rand_index % fg_windows_.size()] : + bg_windows_[rand_index % bg_windows_.size()]; + + bool do_mirror = mirror && PrefetchRand() % 2; + + // load the image containing the window + pair > image = + image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + + cv::Mat cv_img; + if (this->cache_images_) { + pair < std::string, Datum > image_cached = + image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; + cv_img = DecodeDatumToCVMat(image_cached.second, true); + } else { + cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); + if (!cv_img.data) { + LOG(ERROR) << "Could not open or find file " << image.first; + return; + } + } + read_time += timer.MicroSeconds(); + timer.Start(); + const int channels = cv_img.channels(); + + // crop window out of image and warp it + int x1 = window[WindowDataLayer < Dtype > ::X1]; + int y1 = window[WindowDataLayer < Dtype > ::Y1]; + int x2 = window[WindowDataLayer < Dtype > ::X2]; + int y2 = window[WindowDataLayer < Dtype > ::Y2]; + + int pad_w = 0; + int pad_h = 0; + if (context_pad > 0 || use_square) { + // scale factor by which to expand the original region + // such that after warping the expanded region to crop_size x crop_size + // there's exactly context_pad amount of padding on each side + Dtype context_scale = static_cast(crop_size) + / static_cast(crop_size - 2 * context_pad); + + // compute the expanded region + Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; + Dtype half_width = static_cast(x2 - x1 + 1) / 2.0; + Dtype center_x = static_cast(x1) + half_width; + Dtype center_y = static_cast(y1) + half_height; + if (use_square) { + if (half_height > half_width) { + half_width = half_height; + } else { + half_height = half_width; + } + } + x1 = static_cast(round(center_x - half_width * context_scale)); + x2 = static_cast(round(center_x + half_width * context_scale)); + y1 = static_cast(round(center_y - half_height * context_scale)); + y2 = static_cast(round(center_y + half_height * 
context_scale)); + + // the expanded region may go outside of the image + // so we compute the clipped (expanded) region and keep track of + // the extent beyond the image + int unclipped_height = y2 - y1 + 1; + int unclipped_width = x2 - x1 + 1; + int pad_x1 = std::max(0, -x1); + int pad_y1 = std::max(0, -y1); + int pad_x2 = std::max(0, x2 - cv_img.cols + 1); + int pad_y2 = std::max(0, y2 - cv_img.rows + 1); + // clip bounds + x1 = x1 + pad_x1; + x2 = x2 - pad_x2; + y1 = y1 + pad_y1; + y2 = y2 - pad_y2; + CHECK_GT(x1, -1); + CHECK_GT(y1, -1); + CHECK_LT(x2, cv_img.cols); + CHECK_LT(y2, cv_img.rows); + + int clipped_height = y2 - y1 + 1; + int clipped_width = x2 - x1 + 1; + + // scale factors that would be used to warp the unclipped + // expanded region + Dtype scale_x = static_cast(crop_size) + / static_cast(unclipped_width); + Dtype scale_y = static_cast(crop_size) + / static_cast(unclipped_height); + + // size to warp the clipped expanded region to + cv_crop_size.width = static_cast(round( + static_cast(clipped_width) * scale_x)); + cv_crop_size.height = static_cast(round( + static_cast(clipped_height) * scale_y)); + pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); + pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); + pad_y1 = static_cast(round(static_cast(pad_y1) * scale_y)); + pad_y2 = static_cast(round(static_cast(pad_y2) * scale_y)); + + pad_h = pad_y1; + // if we're mirroring, we mirror the padding too (to be pedantic) + if (do_mirror) { + pad_w = pad_x2; + } else { + pad_w = pad_x1; + } + + // ensure that the warped, clipped region plus the padding fits in the + // crop_size x crop_size image (it might not due to rounding) + if (pad_h + cv_crop_size.height > crop_size) { + cv_crop_size.height = crop_size - pad_h; + } + if (pad_w + cv_crop_size.width > crop_size) { + cv_crop_size.width = crop_size - pad_w; + } + } + + cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); + cv::Mat cv_cropped_img = cv_img(roi); + cv::resize(cv_cropped_img, 
cv_cropped_img, cv_crop_size, 0, 0, + cv::INTER_LINEAR); + + // horizontal flip at random + if (do_mirror) { + cv::flip(cv_cropped_img, cv_cropped_img, 1); + } + + // copy the warped window into top_data + for (int h = 0; h < cv_cropped_img.rows; ++h) { + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < cv_cropped_img.cols; ++w) { + for (int c = 0; c < channels; ++c) { + int top_index = ((item_id * channels + c) * crop_size + h + pad_h) + * crop_size + w + pad_w; + // int top_index = (c * height + h) * width + w; + Dtype pixel = static_cast(ptr[img_index++]); + if (this->has_mean_file_) { + int mean_index = (c * mean_height + h + mean_off + pad_h) + * mean_width + w + mean_off + pad_w; + top_data[top_index] = (pixel - mean[mean_index]) * scale; + } else { + if (this->has_mean_values_) { + top_data[top_index] = (pixel - this->mean_values_[c]) * scale; + } else { + top_data[top_index] = pixel * scale; + } + } + } + } + } + trans_time += timer.MicroSeconds(); + // get window label + top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL]; + + item_id++; + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } INSTANTIATE_CLASS (WindowDataLayer); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 23085112..6911854c 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -21,897 +21,886 @@ namespace caffe { template Net::Net(const NetParameter& param) { - Init(param); + Init(param); } template Net::Net(const string& param_file, Phase phase) { - NetParameter param; - ReadNetParamsFromTextFileOrDie(param_file, ¶m); - param.mutable_state()->set_phase(phase); - Init(param); + NetParameter param; + ReadNetParamsFromTextFileOrDie(param_file, ¶m); + param.mutable_state()->set_phase(phase); + Init(param); } template void Net::Init(const 
NetParameter& in_param) { - // Set phase from the state. - phase_ = in_param.state().phase(); - // Filter layers based on their include/exclude rules and - // the current NetState. - NetParameter filtered_param; - FilterNet(in_param, &filtered_param); - LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); - // Create a copy of filtered_param with splits added where necessary. - NetParameter param; - InsertSplits(filtered_param, ¶m); - // Basically, build all the layers and set up their connections. - name_ = param.name(); - map blob_name_to_idx; - set < string > available_blobs; - CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) - << "Must specify either input_shape OR deprecated input_dim, not both."; - if (param.input_dim_size() > 0) { - // Deprecated 4D dimensions. - CHECK_EQ(param.input_size() * 4, param.input_dim_size()) - << "Incorrect input blob dimension specifications."; - } else { - CHECK_EQ(param.input_size(), param.input_shape_size()) - << "Exactly one input_shape must be specified per input."; - } - memory_used_ = 0; - // set the input blobs - for (int input_id = 0; input_id < param.input_size(); ++input_id) { - const int layer_id = -1; // inputs have fake layer ID -1 - AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - // For each layer, set up its input and output - bottom_vecs_.resize(param.layer_size()); - top_vecs_.resize(param.layer_size()); - bottom_id_vecs_.resize(param.layer_size()); - param_id_vecs_.resize(param.layer_size()); - top_id_vecs_.resize(param.layer_size()); - bottom_need_backward_.resize(param.layer_size()); - for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { - // Inherit phase from net if unset. - if (!param.layer(layer_id).has_phase()) { - param.mutable_layer(layer_id)->set_phase(phase_); - } - // Setup layer. 
- const LayerParameter& layer_param = param.layer(layer_id); - if (layer_param.propagate_down_size() > 0) { - CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) - << "propagate_down param must be specified " - << "either 0 or bottom_size times "; - } - layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); - layer_names_.push_back(layer_param.name()); - LOG(INFO) << "Creating Layer " << layer_param.name(); - bool need_backward = false; - - // Figure out this layer's input and output - for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { - const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); - // If a blob needs backward, this layer should provide it. - need_backward |= blob_need_backward_[blob_id]; - } - int num_top = layer_param.top_size(); - for (int top_id = 0; top_id < num_top; ++top_id) { - AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); - } - // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter - // specified fewer than the required number (as specified by - // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. - Layer < Dtype > *layer = layers_[layer_id].get(); - if (layer->AutoTopBlobs()) { - const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); - for (; num_top < needed_num_top; ++num_top) { - // Add "anonymous" top blobs -- do not modify available_blobs or - // blob_name_to_idx as we don't want these blobs to be usable as input - // to other layers. - AppendTop(param, layer_id, num_top, NULL, NULL); - } - } - // After this layer is connected, set it up. 
- LOG(INFO) << "Setting up " << layer_names_[layer_id]; - layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { - blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); - } - blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); - if (layer->loss(top_id)) { - LOG(INFO) << " with loss weight " << layer->loss(top_id); - } - memory_used_ += top_vecs_[layer_id][top_id]->count(); - } - DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); - const int param_size = layer_param.param_size(); - const int num_param_blobs = layers_[layer_id]->blobs().size(); - CHECK_LE(param_size, num_param_blobs) - << "Too many params specified for layer " << layer_param.name(); - ParamSpec default_param_spec; - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - const ParamSpec* param_spec = - (param_id < param_size) ? - &layer_param.param(param_id) : - &default_param_spec; - const bool param_need_backward = param_spec->lr_mult() > 0; - need_backward |= param_need_backward; - layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); - } - for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - AppendParam(param, layer_id, param_id); - } - // Finally, set the backward flag - layer_need_backward_.push_back(need_backward); - if (need_backward) { - for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { - blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; - } - } - } - // Go through the net backwards to determine which blobs contribute to the - // loss. We can skip backward computation for blobs that don't contribute - // to the loss. 
- // Also checks if all bottom blobs don't need backward computation (possible - // because the skip_propagate_down param) and so we can skip bacward - // computation for the entire layer - set < string > blobs_under_loss; - set < string > blobs_skip_backp; - for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { - bool layer_contributes_loss = false; - bool layer_skip_propagate_down = true; - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { - layer_contributes_loss = true; - } - if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { - layer_skip_propagate_down = false; - } - if (layer_contributes_loss && !layer_skip_propagate_down) - break; - } - // If this layer can skip backward computation, also all his bottom blobs - // don't need backpropagation - if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { - layer_need_backward_[layer_id] = false; - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - bottom_need_backward_[layer_id][bottom_id] = false; - } - } - if (!layer_contributes_loss) { - layer_need_backward_[layer_id] = false; - } - if (layer_need_backward_[layer_id]) { - LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; - } else { - LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; - } - for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { - if (layer_contributes_loss) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - blobs_under_loss.insert(blob_name); - } else { - bottom_need_backward_[layer_id][bottom_id] = false; - } - if (!bottom_need_backward_[layer_id][bottom_id]) { - const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - 
blobs_skip_backp.insert(blob_name); - } - } - } - // Handle force_backward if needed. - if (param.force_backward()) { - for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { - layer_need_backward_[layer_id] = true; - for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { - bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - layers_[layer_id]->set_param_propagate_down(param_id, true); - } - } - } - // In the end, all remaining blobs are considered output blobs. - for (set::iterator it = available_blobs.begin(); - it != available_blobs.end(); ++it) { - LOG(INFO) << "This network produces output " << *it; - net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); - net_output_blob_indices_.push_back(blob_name_to_idx[*it]); - } - for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { - blob_names_index_[blob_names_[blob_id]] = blob_id; - } - for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { - layer_names_index_[layer_names_[layer_id]] = layer_id; - } - GetLearningRateAndWeightDecay(); - debug_info_ = param.debug_info(); - LOG(INFO) << "Network initialization done."; - LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + // Set phase from the state. + phase_ = in_param.state().phase(); + // Filter layers based on their include/exclude rules and + // the current NetState. 
+ NetParameter filtered_param; + FilterNet(in_param, &filtered_param); + LOG(INFO) << "Initializing net from parameters: " << std::endl + << filtered_param.DebugString(); + // Create a copy of filtered_param with splits added where necessary. + NetParameter param; + InsertSplits(filtered_param, ¶m); + // Basically, build all the layers and set up their connections. + name_ = param.name(); + map blob_name_to_idx; + set < string > available_blobs; + CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) + << "Must specify either input_shape OR deprecated input_dim, not both."; + if (param.input_dim_size() > 0) { + // Deprecated 4D dimensions. + CHECK_EQ(param.input_size() * 4, param.input_dim_size()) + << "Incorrect input blob dimension specifications."; + } else { + CHECK_EQ(param.input_size(), param.input_shape_size()) + << "Exactly one input_shape must be specified per input."; + } + memory_used_ = 0; + // set the input blobs + for (int input_id = 0; input_id < param.input_size(); ++input_id) { + const int layer_id = -1; // inputs have fake layer ID -1 + AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + // For each layer, set up its input and output + bottom_vecs_.resize(param.layer_size()); + top_vecs_.resize(param.layer_size()); + bottom_id_vecs_.resize(param.layer_size()); + param_id_vecs_.resize(param.layer_size()); + top_id_vecs_.resize(param.layer_size()); + bottom_need_backward_.resize(param.layer_size()); + for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { + // Inherit phase from net if unset. + if (!param.layer(layer_id).has_phase()) { + param.mutable_layer(layer_id)->set_phase(phase_); + } + // Setup layer. 
+ const LayerParameter& layer_param = param.layer(layer_id); + if (layer_param.propagate_down_size() > 0) { + CHECK_EQ(layer_param.propagate_down_size(), layer_param.bottom_size()) + << "propagate_down param must be specified " + << "either 0 or bottom_size times "; + } + layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); + layer_names_.push_back(layer_param.name()); + LOG(INFO) << "Creating Layer " << layer_param.name(); + bool need_backward = false; + + // Figure out this layer's input and output + for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); + ++bottom_id) { + const int blob_id = AppendBottom(param, layer_id, bottom_id, + &available_blobs, &blob_name_to_idx); + // If a blob needs backward, this layer should provide it. + need_backward |= blob_need_backward_[blob_id]; + } + int num_top = layer_param.top_size(); + for (int top_id = 0; top_id < num_top; ++top_id) { + AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); + } + // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter + // specified fewer than the required number (as specified by + // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. + Layer < Dtype > *layer = layers_[layer_id].get(); + if (layer->AutoTopBlobs()) { + const int needed_num_top = std::max(layer->MinTopBlobs(), + layer->ExactNumTopBlobs()); + for (; num_top < needed_num_top; ++num_top) { + // Add "anonymous" top blobs -- do not modify available_blobs or + // blob_name_to_idx as we don't want these blobs to be usable as input + // to other layers. + AppendTop(param, layer_id, num_top, NULL, NULL); + } + } + // After this layer is connected, set it up. 
+ LOG(INFO) << "Setting up " << layer_names_[layer_id]; + layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { + blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); + } + blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); + LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); + if (layer->loss(top_id)) { + LOG(INFO) << " with loss weight " << layer->loss(top_id); + } + memory_used_ += top_vecs_[layer_id][top_id]->count(); + } + DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); + const int param_size = layer_param.param_size(); + const int num_param_blobs = layers_[layer_id]->blobs().size(); + CHECK_LE(param_size, num_param_blobs) + << "Too many params specified for layer " << layer_param.name(); + ParamSpec default_param_spec; + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + const ParamSpec* param_spec = + (param_id < param_size) ? + &layer_param.param(param_id) : &default_param_spec; + const bool param_need_backward = param_spec->lr_mult() > 0; + need_backward |= param_need_backward; + layers_[layer_id]->set_param_propagate_down(param_id, + param_need_backward); + } + for (int param_id = 0; param_id < num_param_blobs; ++param_id) { + AppendParam(param, layer_id, param_id); + } + // Finally, set the backward flag + layer_need_backward_.push_back(need_backward); + if (need_backward) { + for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) { + blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true; + } + } + } + // Go through the net backwards to determine which blobs contribute to the + // loss. We can skip backward computation for blobs that don't contribute + // to the loss. 
+ // Also checks if all bottom blobs don't need backward computation (possible + // because the skip_propagate_down param) and so we can skip bacward + // computation for the entire layer + set < string > blobs_under_loss; + set < string > blobs_skip_backp; + for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { + bool layer_contributes_loss = false; + bool layer_skip_propagate_down = true; + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + if (layers_[layer_id]->loss(top_id) + || (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + layer_contributes_loss = true; + } + if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { + layer_skip_propagate_down = false; + } + if (layer_contributes_loss && !layer_skip_propagate_down) + break; + } + // If this layer can skip backward computation, also all his bottom blobs + // don't need backpropagation + if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { + layer_need_backward_[layer_id] = false; + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + ++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = false; + } + } + if (!layer_contributes_loss) { + layer_need_backward_[layer_id] = false; + } + if (layer_need_backward_[layer_id]) { + LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; + } else { + LOG(INFO) << layer_names_[layer_id] + << " does not need backward computation."; + } + for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); + ++bottom_id) { + if (layer_contributes_loss) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blobs_under_loss.insert(blob_name); + } else { + bottom_need_backward_[layer_id][bottom_id] = false; + } + if (!bottom_need_backward_[layer_id][bottom_id]) { + const string& blob_name = + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + 
blobs_skip_backp.insert(blob_name); + } + } + } + // Handle force_backward if needed. + if (param.force_backward()) { + for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { + layer_need_backward_[layer_id] = true; + for (int bottom_id = 0; + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_need_backward_[layer_id][bottom_id] = + bottom_need_backward_[layer_id][bottom_id] + || layers_[layer_id]->AllowForceBackward(bottom_id); + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] + || bottom_need_backward_[layer_id][bottom_id]; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + layers_[layer_id]->set_param_propagate_down(param_id, true); + } + } + } + // In the end, all remaining blobs are considered output blobs. + for (set::iterator it = available_blobs.begin(); + it != available_blobs.end(); ++it) { + LOG(INFO) << "This network produces output " << *it; + net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); + net_output_blob_indices_.push_back(blob_name_to_idx[*it]); + } + for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) { + blob_names_index_[blob_names_[blob_id]] = blob_id; + } + for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { + layer_names_index_[layer_names_[layer_id]] = layer_id; + } + GetLearningRateAndWeightDecay(); + debug_info_ = param.debug_info(); + LOG(INFO) << "Network initialization done."; + LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); } template void Net::FilterNet(const NetParameter& param, - NetParameter* param_filtered) { - NetState net_state(param.state()); - param_filtered->CopyFrom(param); - param_filtered->clear_layer(); - for (int i = 0; i < param.layer_size(); ++i) { - const LayerParameter& layer_param = param.layer(i); - const string& layer_name = layer_param.name(); - CHECK(layer_param.include_size() == 0 || 
layer_param.exclude_size() == 0) - << "Specify either include rules or exclude rules; not both."; - // If no include rules are specified, the layer is included by default and - // only excluded if it meets one of the exclude rules. - bool layer_included = (layer_param.include_size() == 0); - for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { - layer_included = false; - } - } - for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { - if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { - layer_included = true; - } - } - if (layer_included) { - param_filtered->add_layer()->CopyFrom(layer_param); - } - } -} - -template -bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { - // Check whether the rule is broken due to phase. - if (rule.has_phase()) { - if (rule.phase() != state.phase()) { - LOG(INFO) << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to min level. - if (rule.has_min_level()) { - if (state.level() < rule.min_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to max level. - if (rule.has_max_level()) { - if (state.level() > rule.max_level()) { - LOG(INFO) << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to stage. The NetState must - // contain ALL of the rule's stages to meet it. 
- for (int i = 0; i < rule.stage_size(); ++i) { - // Check that the NetState contains the rule's ith stage. - bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.stage(i) == state.stage(j)) { - has_stage = true; - } - } - if (!has_stage) { - LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - // Check whether the rule is broken due to not_stage. The NetState must - // contain NONE of the rule's not_stages to meet it. - for (int i = 0; i < rule.not_stage_size(); ++i) { - // Check that the NetState contains the rule's ith not_stage. - bool has_stage = false; - for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.not_stage(i) == state.stage(j)) { - has_stage = true; - } - } - if (has_stage) { - LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; - return false; - } - } - return true; + NetParameter* param_filtered) { + NetState net_state(param.state()); + param_filtered->CopyFrom(param); + param_filtered->clear_layer(); + for (int i = 0; i < param.layer_size(); ++i) { + const LayerParameter& layer_param = param.layer(i); + const string& layer_name = layer_param.name(); + CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) + << "Specify either include rules or exclude rules; not both."; + // If no include rules are specified, the layer is included by default and + // only excluded if it meets one of the exclude rules. 
+ bool layer_included = (layer_param.include_size() == 0); + for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) { + layer_included = false; + } + } + for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) { + if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) { + layer_included = true; + } + } + if (layer_included) { + param_filtered->add_layer()->CopyFrom(layer_param); + } + } +} + +template +bool Net::StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name) { + // Check whether the rule is broken due to phase. + if (rule.has_phase()) { + if (rule.phase() != state.phase()) { + LOG(INFO) << "The NetState phase (" << state.phase() + << ") differed from the phase (" << rule.phase() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to min level. + if (rule.has_min_level()) { + if (state.level() < rule.min_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the min_level (" << rule.min_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to max level. + if (rule.has_max_level()) { + if (state.level() > rule.max_level()) { + LOG(INFO) << "The NetState level (" << state.level() + << ") is above the max_level (" << rule.max_level() + << ") specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to stage. The NetState must + // contain ALL of the rule's stages to meet it. + for (int i = 0; i < rule.stage_size(); ++i) { + // Check that the NetState contains the rule's ith stage. 
+ bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.stage(i) == state.stage(j)) { + has_stage = true; + } + } + if (!has_stage) { + LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + // Check whether the rule is broken due to not_stage. The NetState must + // contain NONE of the rule's not_stages to meet it. + for (int i = 0; i < rule.not_stage_size(); ++i) { + // Check that the NetState contains the rule's ith not_stage. + bool has_stage = false; + for (int j = 0; !has_stage && j < state.stage_size(); ++j) { + if (rule.not_stage(i) == state.stage(j)) { + has_stage = true; + } + } + if (has_stage) { + LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) + << "' specified by a rule in layer " << layer_name; + return false; + } + } + return true; } // Helper for Net::Init: add a new input or top blob to the net. (Inputs have // layer_id == -1, tops have layer_id >= 0.) template void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { - shared_ptr < LayerParameter - > layer_param( - (layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : - NULL); - const string& blob_name = - layer_param ? - (layer_param->top_size() > top_id ? 
- layer_param->top(top_id) : - "(automatic)") : - param.input(top_id); - // Check if we are doing in-place computation - if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { - // In-place computation - LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; - top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); - top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); - } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { - // If we are not doing in-place computation but have duplicated blobs, - // raise an error. - LOG(FATAL) << "Duplicate blobs produced by multiple sources."; - } else { - // Normal output. - if (layer_param) { - LOG(INFO) << layer_param->name() << " -> " << blob_name; - } else { - LOG(INFO) << "Input " << top_id << " -> " << blob_name; - } - shared_ptr < Blob > blob_pointer(new Blob()); - const int blob_id = blobs_.size(); - blobs_.push_back(blob_pointer); - blob_names_.push_back(blob_name); - blob_need_backward_.push_back(false); - if (blob_name_to_idx) { - (*blob_name_to_idx)[blob_name] = blob_id; - } - if (layer_id == -1) { - // Set the (explicitly specified) dimensions of the input blob. - if (param.input_dim_size() > 0) { - blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); - } else { - blob_pointer->Reshape(param.input_shape(top_id)); - } - net_input_blob_indices_.push_back(blob_id); - net_input_blobs_.push_back(blob_pointer.get()); - } else { - top_id_vecs_[layer_id].push_back(blob_id); - top_vecs_[layer_id].push_back(blob_pointer.get()); - } - } - if (available_blobs) { - available_blobs->insert(blob_name); - } + const int top_id, set* available_blobs, + map* blob_name_to_idx) { + shared_ptr < LayerParameter + > layer_param( + (layer_id >= 0) ? 
(new LayerParameter(param.layer(layer_id))) : NULL); + const string& blob_name = + layer_param ? + (layer_param->top_size() > top_id ? + layer_param->top(top_id) : "(automatic)") : + param.input(top_id); + // Check if we are doing in-place computation + if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id + && blob_name == layer_param->bottom(top_id)) { + // In-place computation + LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; + top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); + top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); + } else if (blob_name_to_idx + && blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + // If we are not doing in-place computation but have duplicated blobs, + // raise an error. + LOG(FATAL) << "Duplicate blobs produced by multiple sources."; + } else { + // Normal output. + if (layer_param) { + LOG(INFO) << layer_param->name() << " -> " << blob_name; + } else { + LOG(INFO) << "Input " << top_id << " -> " << blob_name; + } + shared_ptr < Blob > blob_pointer(new Blob()); + const int blob_id = blobs_.size(); + blobs_.push_back(blob_pointer); + blob_names_.push_back(blob_name); + blob_need_backward_.push_back(false); + if (blob_name_to_idx) { + (*blob_name_to_idx)[blob_name] = blob_id; + } + if (layer_id == -1) { + // Set the (explicitly specified) dimensions of the input blob. 
+ if (param.input_dim_size() > 0) { + blob_pointer->Reshape(param.input_dim(top_id * 4), + param.input_dim(top_id * 4 + 1), param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); + } else { + blob_pointer->Reshape(param.input_shape(top_id)); + } + net_input_blob_indices_.push_back(blob_id); + net_input_blobs_.push_back(blob_pointer.get()); + } else { + top_id_vecs_[layer_id].push_back(blob_id); + top_vecs_[layer_id].push_back(blob_pointer.get()); + } + } + if (available_blobs) { + available_blobs->insert(blob_name); + } } // Helper for Net::Init: add a new bottom blob to the net. template int Net::AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx) { - const LayerParameter& layer_param = param.layer(layer_id); - const string& blob_name = layer_param.bottom(bottom_id); - if (available_blobs->find(blob_name) == available_blobs->end()) { - LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; - } - const int blob_id = (*blob_name_to_idx)[blob_name]; - LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; - bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); - bottom_id_vecs_[layer_id].push_back(blob_id); - available_blobs->erase(blob_name); - bool propagate_down = true; - // Check if the backpropagation on bottom_id should be skipped - if (layer_param.propagate_down_size() > 0) - propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; - bottom_need_backward_[layer_id].push_back(need_backward); - return blob_id; + const int bottom_id, set* available_blobs, + map* blob_name_to_idx) { + const LayerParameter& layer_param = param.layer(layer_id); + const string& blob_name = layer_param.bottom(bottom_id); + if (available_blobs->find(blob_name) == available_blobs->end()) { + LOG(FATAL) << "Unknown blob input " << blob_name << " (at index " 
+ << bottom_id << ") to layer " << layer_id; + } + const int blob_id = (*blob_name_to_idx)[blob_name]; + LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; + bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); + bottom_id_vecs_[layer_id].push_back(blob_id); + available_blobs->erase(blob_name); + bool propagate_down = true; + // Check if the backpropagation on bottom_id should be skipped + if (layer_param.propagate_down_size() > 0) + propagate_down = layer_param.propagate_down(bottom_id); + const bool need_backward = blob_need_backward_[blob_id] && propagate_down; + bottom_need_backward_[layer_id].push_back(need_backward); + return blob_id; } template void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { - const LayerParameter& layer_param = layers_[layer_id]->layer_param(); - const int param_size = layer_param.param_size(); - string param_name = - (param_size > param_id) ? layer_param.param(param_id).name() : ""; - if (param_name.size()) { - param_display_names_.push_back(param_name); - } else { - ostringstream param_display_name; - param_display_name << param_id; - param_display_names_.push_back(param_display_name.str()); - } - const int net_param_id = params_.size(); - params_.push_back(layers_[layer_id]->blobs()[param_id]); - param_id_vecs_[layer_id].push_back(net_param_id); - param_layer_indices_.push_back(make_pair(layer_id, param_id)); - if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { - // This layer "owns" this parameter blob -- it is either anonymous - // (i.e., not given a param_name) or explicitly given a name that we - // haven't already seen. 
- param_owners_.push_back(-1); - if (param_name.size()) { - param_names_index_[param_name] = net_param_id; - } - } else { - // Named param blob with name we've seen before: share params - const int owner_net_param_id = param_names_index_[param_name]; - param_owners_.push_back(owner_net_param_id); - const pair& owner_index = - param_layer_indices_[owner_net_param_id]; - const int owner_layer_id = owner_index.first; - const int owner_param_id = owner_index.second; - LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); - Blob < Dtype > *owner_blob = - layers_[owner_layer_id]->blobs()[owner_param_id].get(); - const int param_size = layer_param.param_size(); - if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { - // Permissive dimension checking -- only check counts are the same. - CHECK_EQ(this_blob->count(), owner_blob->count()) - << "Shared parameter blobs must have the same count."; - } else { - // Strict dimension checking -- all dims must be the same. - CHECK(this_blob->shape() == owner_blob->shape()); - } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); - } + const int param_id) { + const LayerParameter& layer_param = layers_[layer_id]->layer_param(); + const int param_size = layer_param.param_size(); + string param_name = + (param_size > param_id) ? 
layer_param.param(param_id).name() : ""; + if (param_name.size()) { + param_display_names_.push_back(param_name); + } else { + ostringstream param_display_name; + param_display_name << param_id; + param_display_names_.push_back(param_display_name.str()); + } + const int net_param_id = params_.size(); + params_.push_back(layers_[layer_id]->blobs()[param_id]); + param_id_vecs_[layer_id].push_back(net_param_id); + param_layer_indices_.push_back(make_pair(layer_id, param_id)); + if (!param_size || !param_name.size() + || (param_name.size() + && param_names_index_.find(param_name) == param_names_index_.end())) { + // This layer "owns" this parameter blob -- it is either anonymous + // (i.e., not given a param_name) or explicitly given a name that we + // haven't already seen. + param_owners_.push_back(-1); + if (param_name.size()) { + param_names_index_[param_name] = net_param_id; + } + } else { + // Named param blob with name we've seen before: share params + const int owner_net_param_id = param_names_index_[param_name]; + param_owners_.push_back(owner_net_param_id); + const pair& owner_index = param_layer_indices_[owner_net_param_id]; + const int owner_layer_id = owner_index.first; + const int owner_param_id = owner_index.second; + LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " + << "layer '" << layer_names_[owner_layer_id] << "', param " << "index " + << owner_param_id; + Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); + Blob < Dtype > *owner_blob = + layers_[owner_layer_id]->blobs()[owner_param_id].get(); + const int param_size = layer_param.param_size(); + if (param_size > param_id + && (layer_param.param(param_id).share_mode() + == ParamSpec_DimCheckMode_PERMISSIVE)) { + // Permissive dimension checking -- only check counts are the same. + CHECK_EQ(this_blob->count(), owner_blob->count()) + << "Shared parameter blobs must have the same count."; + } else { + // Strict dimension checking -- all dims must be the same. 
+ CHECK(this_blob->shape() == owner_blob->shape()); + } + layers_[layer_id]->blobs()[param_id]->ShareData( + *layers_[owner_layer_id]->blobs()[owner_param_id]); + } } template void Net::GetLearningRateAndWeightDecay() { - LOG(INFO) << "Collecting Learning Rate and Weight Decay."; - ParamSpec default_param_spec; - for (int i = 0; i < layers_.size(); ++i) { - vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); - for (int j = 0; j < layer_blobs.size(); ++j) { - const ParamSpec* param_spec = - (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; - params_lr_.push_back(param_spec->lr_mult()); - params_weight_decay_.push_back(param_spec->decay_mult()); - } - } + LOG(INFO) << "Collecting Learning Rate and Weight Decay."; + ParamSpec default_param_spec; + for (int i = 0; i < layers_.size(); ++i) { + vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); + for (int j = 0; j < layer_blobs.size(); ++j) { + const ParamSpec* param_spec = + (layers_[i]->layer_param().param_size() > j) ? 
+ &layers_[i]->layer_param().param(j) : &default_param_spec; + params_lr_.push_back(param_spec->lr_mult()); + params_weight_decay_.push_back(param_spec->decay_mult()); + } + } } template Dtype Net::ForwardFromTo(int start, int end) { - CHECK_GE(start, 0); - CHECK_LT(end, layers_.size()); - Dtype loss = 0; - if (debug_info_) { - for (int i = 0; i < net_input_blobs_.size(); ++i) { - InputDebugInfo(i); - } - } - - CPUTimer forward_timer; - CPUTimer layer_timer; - forward_timer.Start(); - - for (int i = start; i <= end; ++i) { - layer_timer.Start(); - Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); - loss += layer_loss; - if (debug_info_) { - ForwardDebugInfo(i); - } - clFinish(amdDevice.CommandQueue); - layer_timer.Stop(); - printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), - layer_timer.MilliSeconds()); - } - - forward_timer.Stop(); - printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); - - return loss; + CHECK_GE(start, 0); + CHECK_LT(end, layers_.size()); + Dtype loss = 0; + if (debug_info_) { + for (int i = 0; i < net_input_blobs_.size(); ++i) { + InputDebugInfo(i); + } + } + + CPUTimer forward_timer; + CPUTimer layer_timer; + forward_timer.Start(); + + for (int i = start; i <= end; ++i) { + layer_timer.Start(); + Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); + loss += layer_loss; + if (debug_info_) { + ForwardDebugInfo(i); + } + clFinish(amdDevice.CommandQueue); + layer_timer.Stop(); + printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); + } + + forward_timer.Stop(); + printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); + + return loss; } template Dtype Net::ForwardFrom(int start) { - return ForwardFromTo(start, layers_.size() - 1); + return ForwardFromTo(start, layers_.size() - 1); } template Dtype Net::ForwardTo(int end) { - return ForwardFromTo(0, end); + return ForwardFromTo(0, end); } template const vector*>& 
Net::ForwardPrefilled(Dtype* loss) { - if (loss != NULL) { - *loss = ForwardFromTo(0, layers_.size() - 1); - } else { - ForwardFromTo(0, layers_.size() - 1); - } - return net_output_blobs_; + if (loss != NULL) { + *loss = ForwardFromTo(0, layers_.size() - 1); + } else { + ForwardFromTo(0, layers_.size() - 1); + } + return net_output_blobs_; } template const vector*>& Net::Forward( - const vector*> & bottom, Dtype* loss) { - // Copy bottom to internal bottom - for (int i = 0; i < bottom.size(); ++i) { - net_input_blobs_[i]->CopyFrom(*bottom[i]); - } - return ForwardPrefilled(loss); + const vector*> & bottom, Dtype* loss) { + // Copy bottom to internal bottom + for (int i = 0; i < bottom.size(); ++i) { + net_input_blobs_[i]->CopyFrom(*bottom[i]); + } + return ForwardPrefilled(loss); } template string Net::Forward(const string& input_blob_protos, Dtype* loss) { - BlobProtoVector blob_proto_vec; - if (net_input_blobs_.size()) { - blob_proto_vec.ParseFromString(input_blob_protos); - CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) - << "Incorrect input size."; - for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { - net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); - } - } - ForwardPrefilled(loss); - blob_proto_vec.Clear(); - for (int i = 0; i < net_output_blobs_.size(); ++i) { - net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); - } - string output; - blob_proto_vec.SerializeToString(&output); - return output; + BlobProtoVector blob_proto_vec; + if (net_input_blobs_.size()) { + blob_proto_vec.ParseFromString(input_blob_protos); + CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size()) + << "Incorrect input size."; + for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) { + net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); + } + } + ForwardPrefilled(loss); + blob_proto_vec.Clear(); + for (int i = 0; i < net_output_blobs_.size(); ++i) { + net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); + } + string output; + 
blob_proto_vec.SerializeToString(&output); + return output; } template void Net::BackwardFromTo(int start, int end) { - CHECK_GE(end, 0); - CHECK_LT(start, layers_.size()); + CHECK_GE(end, 0); + CHECK_LT(start, layers_.size()); - CPUTimer backward_timer; - CPUTimer layer_timer; - backward_timer.Start(); + CPUTimer backward_timer; + CPUTimer layer_timer; + backward_timer.Start(); - for (int i = start; i >= end; --i) { - layer_timer.Start(); - if (layer_need_backward_[i]) { - layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { - BackwardDebugInfo(i); - } - clFinish(amdDevice.CommandQueue); - layer_timer.Start(); - printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), - layer_timer.MilliSeconds()); - } - } + for (int i = start; i >= end; --i) { + layer_timer.Start(); + if (layer_need_backward_[i]) { + layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], + bottom_vecs_[i]); + if (debug_info_) { + BackwardDebugInfo(i); + } + clFinish(amdDevice.CommandQueue); + layer_timer.Start(); + printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); + } + } - backward_timer.Stop(); - printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); + backward_timer.Stop(); + printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); } template void Net::InputDebugInfo(const int input_id) { - const Blob& blob = *net_input_blobs_[input_id]; - const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; + const Blob& blob = *net_input_blobs_[input_id]; + const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " << "Input " << blob_name << " data: " + << data_abs_val_mean; } template 
void Net::ForwardDebugInfo(const int layer_id) { - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const Blob& blob = *top_vecs_[layer_id][top_id]; - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const int net_param_id = param_id_vecs_[layer_id][param_id]; - const string& blob_name = param_display_names_[net_param_id]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; - } + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const Blob& blob = *top_vecs_[layer_id][top_id]; + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", top blob " << blob_name << " data: " << data_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const int net_param_id = param_id_vecs_[layer_id][param_id]; + const string& blob_name = param_display_names_[net_param_id]; + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << blob_name << " data: " << data_abs_val_mean; + } } template void Net::BackwardDebugInfo(const int layer_id) { - const vector*>& bottom_vec = bottom_vecs_[layer_id]; - for (int bottom_id = 0; bottom_id < bottom_vec.size(); 
++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { - continue; - } - const Blob& blob = *bottom_vec[bottom_id]; - const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { - continue; - } - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; - } + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if (!bottom_need_backward_[layer_id][bottom_id]) { + continue; + } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", bottom blob " << blob_name << " diff: " << diff_abs_val_mean; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { + continue; + } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << param_id << " diff: " << diff_abs_val_mean; + } } template void Net::UpdateDebugInfo(const int param_id) { - const Blob& blob = *params_[param_id]; - const int param_owner = param_owners_[param_id]; - const string& layer_name = 
layer_names_[param_layer_indices_[param_id].first]; - const string& param_display_name = param_display_names_[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (param_owner < 0) { - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; - } else { - const string& owner_layer_name = - layer_names_[param_layer_indices_[param_owner].first]; - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " - << "param " << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; - } + const Blob& blob = *params_[param_id]; + const int param_owner = param_owners_[param_id]; + const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; + const string& param_display_name = param_display_names_[param_id]; + const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); + if (param_owner < 0) { + const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); + LOG(INFO) << " [Update] Layer " << layer_name << ", param " + << param_display_name << " data: " << data_abs_val_mean << "; diff: " + << diff_abs_val_mean; + } else { + const string& owner_layer_name = + layer_names_[param_layer_indices_[param_owner].first]; + LOG(INFO) << " [Update] Layer " << layer_name << ", param blob " + << param_display_name << " (owned by layer " << owner_layer_name << ", " + << "param " << param_display_names_[param_owners_[param_id]] << ")" + << " diff: " << diff_abs_val_mean; + } } template void Net::ShareTrainedLayersWith(const Net* other) { - int num_source_layers = other->layers().size(); - for (int i = 0; i < num_source_layers; ++i) { - Layer < Dtype > *source_layer = other->layers()[i].get(); - const string& source_layer_name = other->layer_names()[i]; - int 
target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector < shared_ptr > > &target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); - CHECK(target_blobs[j]->shape() == source_blob->shape()); - target_blobs[j]->ShareData(*source_blob); - } - } + int num_source_layers = other->layers().size(); + for (int i = 0; i < num_source_layers; ++i) { + Layer < Dtype > *source_layer = other->layers()[i].get(); + const string& source_layer_name = other->layer_names()[i]; + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector < shared_ptr > > &target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); + CHECK(target_blobs[j]->shape() == source_blob->shape()); + target_blobs[j]->ShareData(*source_blob); + } + } } template void Net::BackwardFrom(int start) { - BackwardFromTo(start, 0); + BackwardFromTo(start, 0); } template void Net::BackwardTo(int end) { - BackwardFromTo(layers_.size() - 1, end); + BackwardFromTo(layers_.size() 
- 1, end); } template void Net::Backward() { - BackwardFromTo(layers_.size() - 1, 0); - if (debug_info_) { - Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; - for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { - continue; - } - asum_data += params_[i]->asum_data(); - asum_diff += params_[i]->asum_diff(); - sumsq_data += params_[i]->sumsq_data(); - sumsq_diff += params_[i]->sumsq_diff(); - } - const Dtype l2norm_data = std::sqrt(sumsq_data); - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - LOG(ERROR) << " [Backward] All net params (data, diff): " - << "L1 norm = (" << asum_data << ", " << asum_diff << "); " - << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; - } + BackwardFromTo(layers_.size() - 1, 0); + if (debug_info_) { + Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { + continue; + } + asum_data += params_[i]->asum_data(); + asum_diff += params_[i]->asum_diff(); + sumsq_data += params_[i]->sumsq_data(); + sumsq_diff += params_[i]->sumsq_diff(); + } + const Dtype l2norm_data = std::sqrt(sumsq_data); + const Dtype l2norm_diff = std::sqrt(sumsq_diff); + LOG(ERROR) << " [Backward] All net params (data, diff): " + << "L1 norm = (" << asum_data << ", " << asum_diff << "); " + << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")"; + } } template void Net::Reshape() { - for (int i = 0; i < layers_.size(); ++i) { - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); - } + for (int i = 0; i < layers_.size(); ++i) { + layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); + } } template void Net::CopyTrainedLayersFrom(const NetParameter& param) { - int num_source_layers = param.layer_size(); - for (int i = 0; i < num_source_layers; ++i) { - const LayerParameter& source_layer = param.layer(i); - const string& source_layer_name = source_layer.name(); - int target_layer_id = 0; - while (target_layer_id != 
layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { - ++target_layer_id; - } - if (target_layer_id == layer_names_.size()) { - DLOG(INFO) << "Ignoring source layer " << source_layer_name; - continue; - } - DLOG(INFO) << "Copying source layer " << source_layer_name; - vector < shared_ptr > > &target_blobs = - layers_[target_layer_id]->blobs(); - CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) - << "Incompatible number of blobs for layer " << source_layer_name; - for (int j = 0; j < target_blobs.size(); ++j) { - const bool kReshape = false; - target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); - } - } + int num_source_layers = param.layer_size(); + for (int i = 0; i < num_source_layers; ++i) { + const LayerParameter& source_layer = param.layer(i); + const string& source_layer_name = source_layer.name(); + int target_layer_id = 0; + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { + ++target_layer_id; + } + if (target_layer_id == layer_names_.size()) { + DLOG(INFO) << "Ignoring source layer " << source_layer_name; + continue; + } + DLOG(INFO) << "Copying source layer " << source_layer_name; + vector < shared_ptr > > &target_blobs = + layers_[target_layer_id]->blobs(); + CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) + << "Incompatible number of blobs for layer " << source_layer_name; + for (int j = 0; j < target_blobs.size(); ++j) { + const bool kReshape = false; + target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); + } + } } template void Net::CopyTrainedLayersFrom(const string trained_filename) { - NetParameter param; - ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); - CopyTrainedLayersFrom(param); + NetParameter param; + ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); + CopyTrainedLayersFrom(param); } template void Net::ToProto(NetParameter* param, bool write_diff) const { - param->Clear(); - param->set_name(name_); - // Add bottom 
and top - for (int i = 0; i < net_input_blob_indices_.size(); ++i) { - param->add_input(blob_names_[net_input_blob_indices_[i]]); - } - DLOG(INFO) << "Serializing " << layers_.size() << " layers"; - for (int i = 0; i < layers_.size(); ++i) { - LayerParameter* layer_param = param->add_layer(); - for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { - layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); - } - for (int j = 0; j < top_id_vecs_[i].size(); ++j) { - layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); - } - layers_[i]->ToProto(layer_param, write_diff); - } + param->Clear(); + param->set_name(name_); + // Add bottom and top + for (int i = 0; i < net_input_blob_indices_.size(); ++i) { + param->add_input(blob_names_[net_input_blob_indices_[i]]); + } + DLOG(INFO) << "Serializing " << layers_.size() << " layers"; + for (int i = 0; i < layers_.size(); ++i) { + LayerParameter* layer_param = param->add_layer(); + for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) { + layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]); + } + for (int j = 0; j < top_id_vecs_[i].size(); ++j) { + layer_param->add_top(blob_names_[top_id_vecs_[i][j]]); + } + layers_[i]->ToProto(layer_param, write_diff); + } } template void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { - continue; - } - if (debug_info_) { - UpdateDebugInfo(i); - } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: - #ifndef CPU_ONLY - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff); + // First, accumulate the diffs of any shared parameters into their owner's + // diff. (Assumes that the learning rate, weight decay, etc. have already been + // accounted for in the current diff.) + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] < 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } + const int count = params_[i]->count(); + const Dtype* this_diff; + Dtype* owner_diff; + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + + switch (Caffe::mode()) { + case Caffe::CPU: + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + caffe_add(count, this_diff, owner_diff, owner_diff); + break; + case Caffe::GPU: +#ifndef CPU_ONLY + this_diff = params_[i]->gpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); + // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff); #else - NO_GPU; + NO_GPU; #endif - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } - } - // Now, update the owned parameters. 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { - continue; - } - if (debug_info_) { - UpdateDebugInfo(i); - } - params_[i]->Update(); - } + break; + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } + } + // Now, update the owned parameters. + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } + params_[i]->Update(); + } } template bool Net::has_blob(const string& blob_name) const { - return blob_names_index_.find(blob_name) != blob_names_index_.end(); + return blob_names_index_.find(blob_name) != blob_names_index_.end(); } template const shared_ptr > Net::blob_by_name( - const string& blob_name) const { - shared_ptr < Blob > blob_ptr; - if (has_blob(blob_name)) { - blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; - } else { - blob_ptr.reset((Blob*) (NULL)); - LOG(WARNING) << "Unknown blob name " << blob_name; - } - return blob_ptr; + const string& blob_name) const { + shared_ptr < Blob > blob_ptr; + if (has_blob(blob_name)) { + blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; + } else { + blob_ptr.reset((Blob*) (NULL)); + LOG(WARNING) << "Unknown blob name " << blob_name; + } + return blob_ptr; } template bool Net::has_layer(const string& layer_name) const { - return layer_names_index_.find(layer_name) != layer_names_index_.end(); + return layer_names_index_.find(layer_name) != layer_names_index_.end(); } template const shared_ptr > Net::layer_by_name( - const string& layer_name) const { - shared_ptr < Layer > layer_ptr; - if (has_layer(layer_name)) { - layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; - } else { - layer_ptr.reset((Layer*) (NULL)); - LOG(WARNING) << "Unknown layer name " << layer_name; - } - return layer_ptr; + const string& layer_name) const { + shared_ptr < Layer > layer_ptr; + if (has_layer(layer_name)) { + layer_ptr = 
layers_[layer_names_index_.find(layer_name)->second]; + } else { + layer_ptr.reset((Layer*) (NULL)); + LOG(WARNING) << "Unknown layer name " << layer_name; + } + return layer_ptr; } INSTANTIATE_CLASS (Net); diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl index f23ff9a3..99d04575 100644 --- a/src/caffe/ocl/bnll_layer.cl +++ b/src/caffe/ocl/bnll_layer.cl @@ -28,25 +28,25 @@ template __kernel void BNLLForward(const int n, __global const T* in, __global T* out) { - int index = get_global_id(0); - if (index < n) { - out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index])); - } + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index])); + } } template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out); template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out); template __kernel void BNLLBackward(const int n, __global const T* in_diff, - __global const T* in_data, __global T* out_diff) { - int index = get_global_id(0); - if (index < n) { - T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } + __global const T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if (index < n) { + T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); + out_diff[index] = in_diff[index] * expval / (expval + 1.); + } } template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, - __global const float* in_data, __global float* out_diff); + __global const float* in_data, __global float* out_diff); template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* 
in_diff, - __global const double* in_data, __global double* out_diff); + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl index ba5e1f54..a9663fce 100644 --- a/src/caffe/ocl/concat_layer.cl +++ b/src/caffe/ocl/concat_layer.cl @@ -26,29 +26,29 @@ template __kernel void Concat(const int nthreads, __global const T* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global T* out_data) { - int index = get_global_id(0); - if(index < nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward == 1) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward == 1) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } } template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global float* 
out_data); + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, - const int forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, __global double* out_data); + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl index b6fdebc7..477f2ff4 100644 --- a/src/caffe/ocl/contrastive_loss_layer.cl +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -26,39 +26,39 @@ template __kernel void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, - __global Dtype *bottom_diff) { - int i = get_global_id(0); - if(i < count) { - int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { + int i = get_global_id(0); + if(i < count) { + int n = i / 
channels; // the num index, to access y and dist_sq + if (static_cast(y[n])) { // similar pairs + bottom_diff[i] = alpha * diff[i]; + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; + } else { + bottom_diff[i] = 0; + } + } + } } template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - __global const float* y, __global const float* diff, __global const float* dist_sq, - __global float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - __global const double* y, __global const double* diff, __global const double* dist_sq, - __global double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl index bb2fc696..230c9715 100644 --- a/src/caffe/ocl/dropout_layer.cl +++ b/src/caffe/ocl/dropout_layer.cl @@ -26,18 +26,18 @@ template __kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) { - int index = get_global_id(0); - if (index < n) - out[index] = in[index] * scale * mask[index]; + int index = get_global_id(0); + if (index < n) + out[index] = 
in[index] * scale * mask[index]; } template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); template __kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff) { - int index = get_global_id(0); - if (index < n) - out_diff[index] = in_diff[index] * scale * mask[index]; + int index = get_global_id(0); + if (index < n) + out_diff[index] = in_diff[index] * scale * mask[index]; } template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl index 3f60a34f..88137dd7 100644 --- a/src/caffe/ocl/eltwise_layer.cl +++ b/src/caffe/ocl/eltwise_layer.cl @@ -26,48 +26,48 @@ template __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, - __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, - __global int* mask) { - int index = get_global_id(0); - if(index < nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; 
- mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype maxval = -FLT_MAX; + int maxidx = -1; + if (bottom_data_a[index] > bottom_data_b[index]) { + // only update for very first bottom_data blob (blob_idx == 0) + if (blob_idx == 0) { + maxval = bottom_data_a[index]; + top_data[index] = maxval; + maxidx = blob_idx; + mask[index] = maxidx; + } + } else { + maxval = bottom_data_b[index]; + top_data[index] = maxval; + maxidx = blob_idx + 1; + mask[index] = maxidx; + } + } } template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, - __global const float* bottom_data_b, const int blob_idx, __global float* top_data, - __global int* mask); + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, - __global const double* bottom_data_b, const int blob_idx, __global double* top_data, - __global int* mask); + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); template __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, - const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { - int index = get_global_id(0); - if(index < nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype gradient = 0; + if (mask[index] == 
blob_idx) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, - const int blob_idx, __global const int* mask, __global float* bottom_diff); + const int blob_idx, __global const int* mask, __global float* bottom_diff); template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, - const int blob_idx, __global const int* mask, __global double* bottom_diff); + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 46248024..09f240cf 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -26,31 +26,31 @@ template __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) { - int index=get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; - if(index < n) { - int w_out=index %width_col; - index /= width_col; - int h_out=index%height_col; - int channel_in = index/height_col; - int channel_out=channel_in *ksize *ksize; - int h_in = h_out *stride-pad; - int w_in = w_out *stride-pad; - data_col +=(channel_out *height_col + h_out) *width_col + w_out; - data_im +=(channel_in * height + h_in) *width + w_in; - int i=0,j=0; - for(i=0;i= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col +=height_col *width_col; - } - } - } + int index=get_global_id(0); + data_im = data_im + img_offset; + data_col = data_col + col_offset; + if(index < n) { + int w_out=index %width_col; + index /= width_col; + int h_out=index%height_col; + int channel_in = index/height_col; + int 
channel_out=channel_in *ksize *ksize; + int h_in = h_out *stride-pad; + int w_in = w_out *stride-pad; + data_col +=(channel_out *height_col + h_out) *width_col + w_out; + data_im +=(channel_in * height + h_in) *width + w_in; + int i=0,j=0; + for(i=0;i= 0 && w >= 0 && h < height && w < width) + *data_col=data_im[i * width + j]; + else *data_col=0; + data_col +=height_col *width_col; + } + } + } } template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); @@ -59,34 +59,34 @@ template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const template __kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { - int index = get_global_id(0); + int index = get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; + data_im = data_im + img_offset; + data_col = data_col + col_offset; - int x_out = index % width_col; - int y_out = (index / width_col) % height_col; - int channel_in = (index / width_col / height_col) % channels; - int channel_out = channel_in * ksize * ksize; - int im_id = index / width_col / height_col / channels; + int x_out = index % width_col; + int y_out = (index / width_col) % height_col; + int channel_in = (index / width_col / height_col) % channels; + int channel_out = channel_in * ksize * ksize; + int im_id = index / width_col / height_col / channels; - int y_in = y_out * stride - pad; - int x_in = x_out * stride - pad; - int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; - int offset_im 
= im_id * channels * height * width + channel_in * height * width; + int y_in = y_out * stride - pad; + int x_in = x_out * stride - pad; + int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; + int offset_im = im_id * channels * height * width + channel_in * height * width; - for(int k_h = 0; k_h < ksize; k_h++) { - for(int k_w = 0; k_w < ksize; k_w++) { - int x_im = x_in + k_w; - int y_im = y_in + k_h; - int index_im = y_im * width + x_im; - int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; - if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) - data_col[offset_col + index_col] = data_im[offset_im + index_im]; - else - data_col[offset_col + index_col] = 0; - } - } + for(int k_h = 0; k_h < ksize; k_h++) { + for(int k_w = 0; k_w < ksize; k_w++) { + int x_im = x_in + k_w; + int y_im = y_in + k_h; + int index_im = y_im * width + x_im; + int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) + data_col[offset_col + index_col] = data_im[offset_im + index_im]; + else + data_col[offset_col + index_col] = 0; + } + } } template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); @@ -94,150 +94,150 @@ template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_o template __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - 
__global T* data_col, const int col_offset) { - data_im = data_im + img_offset; - data_col = data_col + col_offset; + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; - int index = get_global_id(0); - if(index < n) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - __global T* data_col_ptr = data_col; - data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; - __global const T* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + i; - int w = w_in + j; - *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; - data_col_ptr += height_col * width_col; - } - } - } + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } } template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global float* data_col, const int col_offset); + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, - const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int height_col, const int width_col, __global double* data_col, const int col_offset); + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); template __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - __global T* data_im, const int img_offset) { - data_col = data_col + col_offset; - data_im = data_im + img_offset; - int index = get_global_id(0); - if(index < n) { - T val = 0; - int w = index % width + pad_w; - int h = (index / width) % 
height + pad_h; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; - int w_col_end = min(w / stride_w + 1, width_col); - int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; - int h_col_end = min(h / stride_h + 1, height_col); - // equivalent implementation - int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; - int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; - int coeff_w_col = (1 - stride_w * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w,const int pad_h, const int pad_w, - const int stride_h, const int stride_w,const int height_col, const int width_col, - __global float* data_im, const int img_offset); + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, - const int col_offset, const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); template __kernel void 
col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) { - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n) { - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); template __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n) { - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height) % channels; - int im = index / width / height / channels; - // compute the start and end of the output - int w_col_start = (w < ksize) ? 
0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col * optnum); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad; + int h = (index / width) % height + pad; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + // compute the start and end of the output + int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; + int w_col_end = min(w / stride + 1, width_col); + int h_col_start = (h < ksize) ? 
0 : (h - ksize) / stride + 1; + int h_col_end = min(h / stride + 1, height_col); + // equivalent implementation + int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride * height_col * width_col * optnum); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); @@ -245,46 +245,46 @@ template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_o template __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) { - int index = get_global_id(0); - data_opt = data_opt + opt_offset; - data_im = data_im + im_offset; - if(index < n) { - int w = index % width; - int h = (index / width) % height; - int c = index / (width * height) % channels; - int im = index / width / height / channels; + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + 
im_offset; + if(index < n) { + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; - int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; - data_opt[opt_index] = data_im[index]; - } + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } } template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); template __kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidyy = gidy; - int index = gidy / height; - int offset = index * width * height; - gidy = gidy % height; - if( gidx < width && gidyy < height * optnum ) - dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; } template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const 
int heighti, int optnum); template __kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) { - int gidx = get_global_id(0); - int index; - index = (optnum==1) ? 0: gidx % optnum; - dst = dst + top_offset; // now we point at (*top)[n] - int offset = gidx / optnum; - int i = 0; - for(i = 0; i < width; i++) - dst[(index * height + offset)* width + i] = src[gidx * width + i]; + int gidx = get_global_id(0); + int index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; } template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl index e9938966..67eed4ae 100644 --- a/src/caffe/ocl/lrn_layer.cl +++ b/src/caffe/ocl/lrn_layer.cl @@ -26,113 +26,113 @@ template __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) - out[index] = in[index] * pow(scale[index], negative_beta); + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); } template __attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); template 
__attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); template __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - in = in + offset; - scale = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in[head * step] * in[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in[head * step] * in[head * step]; - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in[(head - size) * step] - * in[(head - size) * step]; - } - scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; 
+ in = in + offset; + scale = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + } } template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); template __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n 
* channels * height + h) * width + w; - const int step = height * width; - bottom_data += offset; - top_data += offset; - scale += offset; - top_diff += offset; - bottom_diff += offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - T accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - } - bottom_diff[(head - post_pad) * step] = - top_diff[(head - post_pad) * step] - * pow(scale[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - } + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + 
while (head < post_pad && head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } } template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl index 786ddc16..49a1413a 100644 --- a/src/caffe/ocl/pooling_layer.cl +++ b/src/caffe/ocl/pooling_layer.cl @@ -26,68 +26,68 @@ template __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* 
top_data, __global int* mask, __global T* top_mask) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index += tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - T maxval = -FLT_MAX; - int maxidx = -1; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_data[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_data[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_data[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_data[maxidx]; + } + } + } + top_data[index] = maxval; + if (mask) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } } template 
__attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); template __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - T aveval = 0; - bottom_data = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; 
++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_data[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + T aveval = 0; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_data[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } } template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); @@ -95,150 +95,150 @@ template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AveP template __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < nthreads; index+=tmp) 
{ - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - T cumsum = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. - cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_data[h * width + w]; - return; - } - } - } - } + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + T cumsum = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + } + } + const float thres = rand_idx[index] * cumsum; + // Second pass: get value, and set index. 
+ cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + if (cumsum >= thres) { + rand_idx[index] = ((n * channels + c) * height + h) * width + w; + top_data[index] = bottom_data[h * width + w]; + return; + } + } + } + } } template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); template __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < count; index+=tmp) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = 
FLT_MIN; - T cumsum = FLT_MIN; - T cumvalues = 0.; - bottom_data = bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_data[h * width + w]; - cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum;} + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < count; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = FLT_MIN; + T cumsum = FLT_MIN; + T cumvalues = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum;} } template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int 
kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); template __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, - __global int* mask, __global T* top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, __global T* const bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - T gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff += offset; - if (mask) { - mask = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } else { - top_mask = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } + __global int* mask, __global T* top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global T* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + T gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + if (mask) { + mask = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } else { + top_mask = top_mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); template __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int 
pad_w, __global T* const bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total) { - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - T gradient = 0; - top_diff += (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + T gradient = 0; + top_diff += (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff[ph * pooled_width + pw] / pool_size; + } + } + bottom_diff[index] = gradient; + } } template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); @@ -246,48 +246,48 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave template __kernel void StoPoolBackward(const int nthreads, - __global Dtype* rand_idx, __global Dtype* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global Dtype* bottom_diff) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < nthreads; index += total) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - rand_idx = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - top_diff = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff[ph * pooled_width + pw] * - (index == static_cast(rand_idx[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; + __global Dtype* rand_idx, __global Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global Dtype* bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + rand_idx = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + top_diff = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff[ph * pooled_width + pw] * + (index == static_cast(rand_idx[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; - } + } } template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, - __global float* rand_idx, __global float* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global float* bottom_diff); + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, - __global double* rand_idx, __global double* top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, __global double* bottom_diff); + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global double* bottom_diff); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index 
de46a5da..5e8c521f 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -26,35 +26,35 @@ template __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { - int index = get_global_id(0); - if(index < count) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; - } + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; + } } template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor); template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor); template __kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) { - int index = get_global_id(0); - if(index < count) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + + (in_data[index] <= 0) * slope_data[c]); + } } template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); 
template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); template __kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) { - int index = get_global_id(0); - if(index < count) { - in_diff += offset_out; - out_diff += offset_in; - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } + int index = get_global_id(0); + if(index < count) { + in_diff += offset_out; + out_diff += offset_in; + out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); + } } template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff); template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index 438931ec..7f8bc5b3 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -30,720 +30,817 @@ //we use the open sourced threefry's GPU implementation typedef uint uint32_t; -struct r123array4x32 { uint32_t v[4]; }; - -enum r123_enum_threefry32x4 -{ - R_32x4_0_0 = 10, R_32x4_0_1 = 26, - R_32x4_1_0 = 11, R_32x4_1_1 = 21, - R_32x4_2_0 = 13, R_32x4_2_1 = 27, - R_32x4_3_0 = 23, R_32x4_3_1 = 5, - R_32x4_4_0 = 6, R_32x4_4_1 = 20, - R_32x4_5_0 = 17, R_32x4_5_1 = 11, - R_32x4_6_0 = 25, R_32x4_6_1 = 10, - R_32x4_7_0 = 18, R_32x4_7_1 = 20 +struct r123array4x32 { + uint32_t v[4]; +}; + +enum r123_enum_threefry32x4 { + R_32x4_0_0 = 
10, + R_32x4_0_1 = 26, + R_32x4_1_0 = 11, + R_32x4_1_1 = 21, + R_32x4_2_0 = 13, + R_32x4_2_1 = 27, + R_32x4_3_0 = 23, + R_32x4_3_1 = 5, + R_32x4_4_0 = 6, + R_32x4_4_1 = 20, + R_32x4_5_0 = 17, + R_32x4_5_1 = 11, + R_32x4_6_0 = 25, + R_32x4_6_1 = 10, + R_32x4_7_0 = 18, + R_32x4_7_1 = 20 }; -inline uint32_t RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline)); inline uint32_t RotL_32(uint32_t x, unsigned int N) -{ - return (x << (N & 31)) | (x >> ((32 - N) & 31)); + __attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) { + return (x << (N & 31)) | (x >> ((32 - N) & 31)); } typedef struct r123array4x32 threefry4x32_ctr_t; typedef struct r123array4x32 threefry4x32_key_t; typedef struct r123array4x32 threefry4x32_ukey_t; -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline)); -inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) -{ - threefry4x32_ctr_t X; - uint32_t ks[4 + 1]; - int i; - ks[4] = 0x1BD11BDA; - /* - for (i = 0; i < 4; i++) - { - ks[i] = k.v[i]; - X.v[i] = in.v[i]; - ks[4] ^= k.v[i]; - }*/ - { - ks[0] = k.v[0]; - X.v[0] = in.v[0]; - ks[4] ^= k.v[0]; - - ks[1] = k.v[1]; - X.v[1] = in.v[1]; - ks[4] ^= k.v[1]; - - ks[2] = k.v[2]; - X.v[2] = in.v[2]; - ks[4] ^= k.v[2]; - - ks[3] = k.v[3]; - X.v[3] = in.v[3]; - ks[4] ^= k.v[3]; - } - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - if (Nrounds > 0) - { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 1) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 2) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += 
X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 3) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 3) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 1; - } if (Nrounds > 4) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 5) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 6) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 7) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 7) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 2; - } if (Nrounds > 8) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 9) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 10) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 11) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 11) { - X.v[0] += ks[3]; - 
X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 3; - } if (Nrounds > 12) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 13) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 14) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 15) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 15) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 4; - } if (Nrounds > 16) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 17) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 18) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 19) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 19) { - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 5; - } if (Nrounds > 20) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 21) { - 
X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 22) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 23) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 23) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 6; - } if (Nrounds > 24) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 25) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 26) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 27) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 27) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 7; - } if (Nrounds > 28) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 29) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 30) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - 
X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 31) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 31) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 8; - } if (Nrounds > 32) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 33) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 34) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 35) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 35) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 9; - } if (Nrounds > 36) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 37) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 38) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 39) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 39) { - X.v[0] += ks[0]; - 
X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 10; - } if (Nrounds > 40) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 41) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 42) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 43) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 43) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 11; - } if (Nrounds > 44) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 45) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 46) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 47) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 47) { - X.v[0] += ks[2]; - X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 12; - } if (Nrounds > 48) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 49) { - 
X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 50) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 51) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 51) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 13; - } if (Nrounds > 52) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 53) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 54) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 55) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 55) { - X.v[0] += ks[4]; - X.v[1] += ks[0]; - X.v[2] += ks[1]; - X.v[3] += ks[2]; - X.v[4 - 1] += 14; - } if (Nrounds > 56) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 57) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 58) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - 
X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 59) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 59) { - X.v[0] += ks[0]; - X.v[1] += ks[1]; - X.v[2] += ks[2]; - X.v[3] += ks[3]; - X.v[4 - 1] += 15; - } if (Nrounds > 60) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 61) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 62) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 63) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 63) { - X.v[0] += ks[1]; - X.v[1] += ks[2]; - X.v[2] += ks[3]; - X.v[3] += ks[4]; - X.v[4 - 1] += 16; - } if (Nrounds > 64) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_0_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_0_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 65) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_1_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_1_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 66) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_2_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_2_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 67) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_3_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_3_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 67) { - X.v[0] += ks[2]; - 
X.v[1] += ks[3]; - X.v[2] += ks[4]; - X.v[3] += ks[0]; - X.v[4 - 1] += 17; - } if (Nrounds > 68) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_4_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_4_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 69) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_5_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_5_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 70) { - X.v[0] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_6_0); - X.v[1] ^= X.v[0]; - X.v[2] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_6_1); - X.v[3] ^= X.v[2]; - } if (Nrounds > 71) { - X.v[0] += X.v[3]; - X.v[3] = RotL_32(X.v[3], R_32x4_7_0); - X.v[3] ^= X.v[0]; - X.v[2] += X.v[1]; - X.v[1] = RotL_32(X.v[1], R_32x4_7_1); - X.v[1] ^= X.v[2]; - } if (Nrounds > 71) { - X.v[0] += ks[3]; - X.v[1] += ks[4]; - X.v[2] += ks[0]; - X.v[3] += ks[1]; - X.v[4 - 1] += 18; - } - return X; -} +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) { + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + /* + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + if (Nrounds > 0) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += 
X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } + if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } + if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 11) { 
+ X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } + if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } + if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 
- 1] += 5; + } + if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } + if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } + if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], 
R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } + if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } + if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], 
R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } + if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } + if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += ks[2]; + 
X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } + if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } + if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 54) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } + if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + 
if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } + if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 16; + } + if (Nrounds > 64) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 65) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 66) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + 
X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 17; + } + if (Nrounds > 68) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 69) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 70) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 18; + } + return X; +} template __kernel void PRNG_threefry4x32_bernoulli( - __global uint4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T inf, - T sup, - T threshold, - uint nrounds, - uint numrandom -){ - size_t gdx = get_global_id(0); - - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; - - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - - threefry4x32_ctr_t random4; - - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - uint4 frnd; - - frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 
1 : 0; - frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; - - randomnumber[gdx] = frnd; - } + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 
1 : 0; + + randomnumber[gdx] = frnd; + } } - template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm); template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm); @@ -752,133 +849,130 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t template __kernel void PRNG_threefry4x32_uniform( - __global float4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T inf, - T sup, - uint nrounds, - uint numrandom -){ - size_t gdx = get_global_id(0); - - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; - - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - - threefry4x32_ctr_t random4; - - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - float4 frnd; - frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); - frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); - frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); - frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); - randomnumber[gdx] = frnd; - } + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); + 
frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); + randomnumber[gdx] = frnd; + } } template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandonm); template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm); - __kernel void PRNG_threefry4x32_uint_uniform( - __global uint4 *randomnumber, - threefry4x32_ctr_t ctr_i, - uint inf, - uint sup, - uint nrounds, - uint numrandom -){ - size_t gdx = get_global_id(0); - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey; - - ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; - - threefry4x32_ctr_t random4; - - if ( gdx < numrandom ) - { - random4 = threefry4x32_R(nrounds, ctr, ukey); - uint4 frnd; - frnd.x = random4.v[0] % (sup - inf) + inf; - frnd.y = random4.v[1] % (sup - inf) + inf; - frnd.z = random4.v[2] % (sup - inf) + inf; - frnd.w = random4.v[3] % (sup - inf) + inf; - randomnumber[gdx] = frnd; - } + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + uint inf, + uint sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = random4.v[0] % (sup - inf) + inf; + frnd.y = random4.v[1] % (sup - inf) + inf; + frnd.z = random4.v[2] % (sup - inf) + inf; + frnd.w = random4.v[3] % (sup - inf) + inf; + randomnumber[gdx] = frnd; + } } - template __kernel void PRNG_threefry4x32_gaussian( - __global float4 *randomnumber, - threefry4x32_ctr_t ctr_i, - T E, - T V, - uint nrounds, - 
uint numrandom -){ - size_t gdx = get_global_id(0); - - uint maxUint = 0; - maxUint--; - float r = (float)maxUint; - - threefry4x32_ctr_t ctr = ctr_i; - threefry4x32_ukey_t ukey1, ukey2; - - ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; - ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; - - threefry4x32_ctr_t random1, random2; - - if ( gdx < numrandom ) - { - random1 = threefry4x32_R(nrounds, ctr, ukey1); - random2 = threefry4x32_R(nrounds, ctr, ukey2); - float4 frnd1; - - float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution - float r2 = (((float)random2.v[0]) / r); - float r3 = (((float)random1.v[1]) / r); - float r4 = (((float)random2.v[1]) / r); - float r5 = (((float)random1.v[2]) / r); - float r6 = (((float)random2.v[2]) / r); - float r7 = (((float)random1.v[3]) / r); - float r8 = (((float)random2.v[3]) / r); - - if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0){ - r2 += 0.0001; - r4 += 0.0001; - r6 += 0.0001; - r8 += 0.0001; - } - - frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data - //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence - frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data - //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence - frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data - //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence - frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data - //frnd2.w = 
sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence - - randomnumber[gdx] = frnd1; - } + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T E, + T V, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey1, ukey2; + + ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; + ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; + + threefry4x32_ctr_t random1, random2; + + if ( gdx < numrandom ) + { + random1 = threefry4x32_R(nrounds, ctr, ukey1); + random2 = threefry4x32_R(nrounds, ctr, ukey2); + float4 frnd1; + + float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution + float r2 = (((float)random2.v[0]) / r); + float r3 = (((float)random1.v[1]) / r); + float r4 = (((float)random2.v[1]) / r); + float r5 = (((float)random1.v[2]) / r); + float r6 = (((float)random2.v[2]) / r); + float r7 = (((float)random1.v[3]) / r); + float r8 = (((float)random2.v[3]) / r); + + if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) { + r2 += 0.0001; + r4 += 0.0001; + r6 += 0.0001; + r8 += 0.0001; + } + + frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.z = 
sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + + randomnumber[gdx] = frnd1; + } } template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm); template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm); - diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl index cf9302d5..e39aa426 100644 --- a/src/caffe/ocl/relu_layer.cl +++ b/src/caffe/ocl/relu_layer.cl @@ -26,9 +26,9 @@ template __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) { - int index = get_global_id(0); - if(index < count) - out[index] = in[index] > 0? in[index]:in[index]*negative_slope; + int index = get_global_id(0); + if(index < count) + out[index] = in[index] > 0? 
in[index]:in[index]*negative_slope; } template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); @@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUFo template __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) { - int index = get_global_id(0); - if(index < count) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); - } + int index = get_global_id(0); + if(index < count) { + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); + } } template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl index a3a9345f..ac0ef9a9 100644 --- a/src/caffe/ocl/sigmoid_layer.cl +++ b/src/caffe/ocl/sigmoid_layer.cl @@ -26,9 +26,9 @@ template __kernel void SigmoidForward(const int count, __global T* in, __global T* out) { - int index = get_global_id(0); - if(index < count) - out[index] = 1. / (1. + exp(-in[index])); + int index = get_global_id(0); + if(index < count) + out[index] = 1. / (1. 
+ exp(-in[index])); } template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out); @@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void Sig template __kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { - int index = get_global_id(0); - const T sigmoid_x = out_data[index]; - if(index < count) - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); + int index = get_global_id(0); + const T sigmoid_x = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); } template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl index 6fe0daab..207f0058 100644 --- a/src/caffe/ocl/softmax_layer.cl +++ b/src/caffe/ocl/softmax_layer.cl @@ -27,47 +27,47 @@ template __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) { - int gid = get_global_id(0); - int size = get_global_size(0); + int gid = get_global_id(0); + int size = get_global_size(0); - resultScratch[gid] = 0.0; - for(int i = gid; i < num; i += size) { - resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); - } - barrier(CLK_LOCAL_MEM_FENCE); + resultScratch[gid] = 0.0; + for(int i = gid; i < num; i += size) { + resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); + } + barrier(CLK_LOCAL_MEM_FENCE); - if(gid < 128) - resultScratch[gid] += resultScratch[gid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - if(gid < 64) - resultScratch[gid] += resultScratch[gid + 64]; - if(gid < 32) - resultScratch[gid] += resultScratch[gid + 32]; - if(gid < 16) - resultScratch[gid] += 
resultScratch[gid + 16]; - if(gid < 8) - resultScratch[gid] += resultScratch[gid + 8]; - if(gid < 4) - resultScratch[gid] += resultScratch[gid + 4]; - if(gid < 2) - resultScratch[gid] += resultScratch[gid + 2]; - if(gid < 1) { - resultScratch[gid] += resultScratch[gid + 1]; - loss[0] = resultScratch[gid]; - } + if(gid < 128) + resultScratch[gid] += resultScratch[gid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + if(gid < 64) + resultScratch[gid] += resultScratch[gid + 64]; + if(gid < 32) + resultScratch[gid] += resultScratch[gid + 32]; + if(gid < 16) + resultScratch[gid] += resultScratch[gid + 16]; + if(gid < 8) + resultScratch[gid] += resultScratch[gid + 8]; + if(gid < 4) + resultScratch[gid] += resultScratch[gid + 4]; + if(gid < 2) + resultScratch[gid] += resultScratch[gid + 2]; + if(gid < 1) { + resultScratch[gid] += resultScratch[gid + 1]; + loss[0] = resultScratch[gid]; + } } template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); template __kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data) { - //printf("softmax_div\n"); - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num*dim; index += total) { - int n = index / dim; - data[index] /= scale[n]; - } + //printf("softmax_div\n"); + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num*dim; index += total) { + int n = index / dim; + data[index] /= scale[n]; + } } template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); @@ -75,97 +75,97 @@ template __attribute__ 
((mangled_name(softmax_div_double))) __kernel void softma template __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } } template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* out); + const int spatial_dim, __global const float* data, __global float* out); template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* out); + const int spatial_dim, __global const double* data, __global double* out); template __kernel void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, __global const T* channel_max, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s 
= index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } } template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); template __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const T* data, __global T* channel_sum) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } } template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const float* data, __global float* channel_sum); + const int spatial_dim, __global const float* data, __global float* channel_sum); template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, __global const double* data, __global double* channel_sum); + const int spatial_dim, __global const double* data, __global double* channel_sum); template __kernel void kernel_channel_div(const int count, 
- const int num, const int channels, - const int spatial_dim, __global const T* channel_sum, __global T* data) { - int index = get_global_id(0); - if(index < count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } } template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const float* channel_sum, __global float* data); + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, __global const double* channel_sum, __global double* data); + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); template __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const T* data_1, __global const T* data_2, - __global T* channel_dot) { - int index = get_global_id(0); - if(index < num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - T dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } + const int spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = 
index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } } template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const float* data_1, __global const float* data_2, - __global float* channel_dot); + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, __global const double* data_1, __global const double* data_2, - __global double* channel_dot); + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl index 70c282e1..731f660c 100644 --- a/src/caffe/ocl/softmaxwithloss_layer.cl +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -26,77 +26,77 @@ template __kernel void SoftmaxLossForwardGPU(const int nthreads, - __global T* prob_data, __global T* label,__global T* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global T* counts) { - int index = get_global_id(0); - if(index < nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - T(FLT_MIN))); - counts[index] = 1; - } - } + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool 
has_ignore_label_, int ignore_label_, + __global T* counts) { + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + T(FLT_MIN))); + counts[index] = 1; + } + } } template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global float* prob_data, __global float* label,__global float* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global float* counts); + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, - __global double* prob_data, __global double* label,__global double* loss, - int num, int dim, int spatial_dim, - bool has_ignore_label_, int ignore_label_, - __global double* counts); + __global double* prob_data, __global double* label,__global double* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); template __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, - __global T* label,__global T* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, T* counts) { - const int channels = dim / spatial_dim; - int index = get_global_id(0); - if(index < nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); + __global T* label,__global T* bottom_diff, int num, int dim, + int 
spatial_dim, bool has_ignore_label_, + int ignore_label_, T* counts) { + const int channels = dim / spatial_dim; + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } } template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, - __global float* label,__global float* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, float* counts); + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, float* counts); template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, - __global double* label,__global double* bottom_diff, int num, int dim, - int spatial_dim, bool has_ignore_label_, - int ignore_label_, double* counts); + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, double* counts); template __kernel void scal (const int num, const T alpha, __global T* data) { - int index = get_global_id(0); - int total = get_global_size(0); - for(index; index < num; index += total) { - data[index] = data[index] * alpha; - } + int index 
= get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total) { + data[index] = data[index] * alpha; + } } template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl index a8bd05c9..900f11ea 100644 --- a/src/caffe/ocl/tanh_layer.cl +++ b/src/caffe/ocl/tanh_layer.cl @@ -26,9 +26,9 @@ template __kernel void TanHForward(const int count, __global T* in, __global T* out) { - int index = get_global_id(0); - if(index < count) - out[index] =tanh(in[index]); + int index = get_global_id(0); + if(index < count) + out[index] =tanh(in[index]); } template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out); @@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHFo template __kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { - int index = get_global_id(0); - const T tanhx = out_data[index]; - if(index < count) - out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx); + int index = get_global_id(0); + const T tanhx = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx); } template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl index 19df83e2..679dbf29 100644 --- a/src/caffe/ocl/threshold_layer.cl +++ b/src/caffe/ocl/threshold_layer.cl @@ -26,9 +26,9 @@ template __kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) { - int index = get_global_id(0); - if(index < count) - out[index] =in[index] > threshold ? 
1 : 0; + int index = get_global_id(0); + if(index < count) + out[index] =in[index] > threshold ? 1 : 0; } template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out); diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index 07a16fbd..576a6e98 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -28,10 +28,10 @@ template __kernel void OCL_memset(__global T* buffer, const T value, const int size) { - int gdx = get_global_id(0); - if(gdx < size) { - buffer[gdx] = value; - } + int gdx = get_global_id(0); + if(gdx < size) { + buffer[gdx] = value; + } } template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size); @@ -39,18 +39,18 @@ template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__ template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); __kernel void OCL_memset2(__global int* buffer, const int value, const int size) { - int gdx = get_global_id(0); - if(gdx < size) { - buffer[gdx] = value; - } + int gdx = get_global_id(0); + if(gdx < size) { + buffer[gdx] = value; + } } template __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { - int gdx = get_global_id(0); - if(gdx < N) { - Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); - } + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } } template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); @@ -58,23 +58,23 @@ template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caff template __kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) { - int index = get_global_id(0); - if(index < n) { - y[index] = fabs(a[index]); - } + int index = 
get_global_id(0); + if(index < n) { + y[index] = fabs(a[index]); + } } template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y); template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y); template __kernel void get_max(const int num, const int dim, __global T* data, __global T* out) { - int index = get_global_id(0); - if (index < num) { - T maxval = -FLT_MAX; - for (int i = 0; i < dim; i++) - maxval = max( data[index*dim + i], maxval ); - out[index] = maxval; - } + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } } template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); @@ -82,9 +82,9 @@ template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(co template __kernel void exp (const int num, __global T* data, __global T* out) { - int index = get_global_id(0); - if (index < num) - out[index] = exp(data[index]); + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); } template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); @@ -92,10 +92,10 @@ template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int template __kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] - b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] - b[index]; + } } template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, 
__global float* out); @@ -103,10 +103,10 @@ template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_ template __kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] + b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] + b[index]; + } } template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out); @@ -114,10 +114,10 @@ template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_ template __kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] / b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] / b[index]; + } } template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out); @@ -125,10 +125,10 @@ template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_ template __kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = a[index] * b[index]; - } + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] * b[index]; + } } template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out); @@ -136,10 +136,10 @@ template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_ template __kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) { - int index = get_global_id(0); - 
if(index < count) { - out[index] = pow(data[index], alpha); - } + int index = get_global_id(0); + if(index < count) { + out[index] = pow(data[index], alpha); + } } template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out); @@ -147,10 +147,10 @@ template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel template __kernel void kernel_exp(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = exp(data[index]); - } + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } } template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); @@ -158,10 +158,10 @@ template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_ template __kernel void kernel_add_scalar(const int count, const T data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = out[index] + data; - } + int index = get_global_id(0); + if(index < count) { + out[index] = out[index] + data; + } } template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); @@ -169,10 +169,10 @@ template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void template __kernel void kernel_log(const int count, __global const T* data, __global T* out) { - int index = get_global_id(0); - if(index < count) { - out[index] = log(data[index]); - } + int index = get_global_id(0); + if(index < count) { + out[index] = log(data[index]); + } } template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out); @@ -180,13 +180,13 @@ template __attribute__ 
((mangled_name(kernel_log_double))) __kernel void kernel_ template __kernel void diff (const int num, const int dim, __global T* data, __global T* label) { - int index = get_global_id(0); - int total = get_global_size(0); - int offset; - for(index; index < num; index += total) { - offset = (int) label[index]; - data[index * dim + offset] -= 1; - } + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total) { + offset = (int) label[index]; + data[index * dim + offset] -= 1; + } } template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); @@ -194,9 +194,9 @@ template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const i template __kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) { - int index = get_global_id(0); - if (index < n) - y[index] = a[index] / b[index]; + int index = get_global_id(0); + if (index < n) + y[index] = a[index] / b[index]; } template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); @@ -204,9 +204,9 @@ template __attribute__ ((mangled_name(div_float))) __kernel void div (const int template __kernel void add_scalar (const int n, const T alpha, __global T* y) { - int index = get_global_id(0); - if (index < n) - y[index] += alpha; + int index = get_global_id(0); + if (index < n) + y[index] += alpha; } template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); @@ -214,18 +214,18 @@ template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_sca template __kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { - int index = get_global_id(0); - if (index < n) - y[index] = in1[index] + in2[index]; + int index = get_global_id(0); + if 
(index < n) + y[index] = in1[index] + in2[index]; } template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); template __kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) { - int index = get_global_id(0); - if (index < n) - y[index] = a[index] * b[index]; + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; } template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); @@ -233,10 +233,10 @@ template __attribute__ ((mangled_name(element_mul_double))) __kernel void elemen template __kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) { - int index = get_global_id(0); - if (index < n) + int index = get_global_id(0); + if (index < n) // y[index] = a[index] + alpha; - y[index] = pow(a[index], alpha); + y[index] = pow(a[index], alpha); } template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ffb77b78..8d7f8238 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -15,391 +15,387 @@ namespace caffe { template Solver::Solver(const SolverParameter& param) -: - net_() { - Init(param); + : net_() { + Init(param); } template void Solver::ocl_setup() { - scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); - add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); - div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); - powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); + scalar_kernel = 
clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); + add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); + div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); + powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); } template Solver::Solver(const string& param_file) -: - net_() { - SolverParameter param; - ReadProtoFromTextFileOrDie(param_file, ¶m); - Init(param); + : net_() { + SolverParameter param; + ReadProtoFromTextFileOrDie(param_file, ¶m); + Init(param); } template void Solver::Init(const SolverParameter& param) { - LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); - param_ = param; - CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - - ocl_setup(); - - if (param_.random_seed() >= 0) { - Caffe::set_random_seed(param_.random_seed()); - } - // Scaffolding code - InitTrainNet(); - InitTestNets(); - LOG(INFO) << "Solver scaffolding done."; - iter_ = 0; - current_step_ = 0; + LOG(INFO) << "Initializing solver from parameters: " << std::endl + << param.DebugString(); + param_ = param; + CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; + + ocl_setup(); + + if (param_.random_seed() >= 0) { + Caffe::set_random_seed(param_.random_seed()); + } + // Scaffolding code + InitTrainNet(); + InitTestNets(); + LOG(INFO) << "Solver scaffolding done."; + iter_ = 0; + current_step_ = 0; } template void Solver::InitTrainNet() { - const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); - const string& field_names = "net, net_param, train_net, train_net_param"; - CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " - << "using one of these fields: " << field_names; - CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " - << "one of these fields specifying a train_net: " << field_names; - NetParameter 
net_param; - if (param_.has_train_net_param()) { - LOG(INFO) << "Creating training net specified in train_net_param."; - net_param.CopyFrom(param_.train_net_param()); - } else if (param_.has_train_net()) { - LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); - ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); - } - if (param_.has_net_param()) { - LOG(INFO) << "Creating training net specified in net_param."; - net_param.CopyFrom(param_.net_param()); - } - if (param_.has_net()) { - LOG(INFO) << "Creating training net from net file: " << param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); - } - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param itself; - // finally, merge in any NetState specified by the train_state (highest - // precedence). - NetState net_state; - net_state.set_phase(TRAIN); - net_state.MergeFrom(net_param.state()); - net_state.MergeFrom(param_.train_state()); - net_param.mutable_state()->CopyFrom(net_state); - net_.reset(new Net(net_param)); + const int num_train_nets = param_.has_net() + param_.has_net_param() + + param_.has_train_net() + param_.has_train_net_param(); + const string& field_names = "net, net_param, train_net, train_net_param"; + CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " + << "using one of these fields: " << field_names; + CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " + << "one of these fields specifying a train_net: " << field_names; + NetParameter net_param; + if (param_.has_train_net_param()) { + LOG(INFO) << "Creating training net specified in train_net_param."; + net_param.CopyFrom(param_.train_net_param()); + } else if (param_.has_train_net()) { + LOG(INFO) << "Creating training net from train_net file: " + << param_.train_net(); + ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); + } + if 
(param_.has_net_param()) { + LOG(INFO) << "Creating training net specified in net_param."; + net_param.CopyFrom(param_.net_param()); + } + if (param_.has_net()) { + LOG(INFO) << "Creating training net from net file: " << param_.net(); + ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); + } + // Set the correct NetState. We start with the solver defaults (lowest + // precedence); then, merge in any NetState specified by the net_param itself; + // finally, merge in any NetState specified by the train_state (highest + // precedence). + NetState net_state; + net_state.set_phase(TRAIN); + net_state.MergeFrom(net_param.state()); + net_state.MergeFrom(param_.train_state()); + net_param.mutable_state()->CopyFrom(net_state); + net_.reset(new Net(net_param)); } template void Solver::InitTestNets() { - const bool has_net_param = param_.has_net_param(); - const bool has_net_file = param_.has_net(); - const int num_generic_nets = has_net_param + has_net_file; - CHECK_LE(num_generic_nets, 1) - << "Both net_param and net_file may not be specified."; - const int num_test_net_params = param_.test_net_param_size(); - const int num_test_net_files = param_.test_net_size(); - const int num_test_nets = num_test_net_params + num_test_net_files; - if (num_generic_nets) { - CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } else { - CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } - // If we have a generic net (specified by net or net_param, rather than - // test_net or test_net_param), we may have an unlimited number of actual - // test networks -- the actual number is given by the number of remaining - // test_iters after any test nets specified by test_net_param and/or test_net - // are evaluated. 
- const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; - const int num_test_net_instances = num_test_nets + num_generic_net_instances; - if (param_.test_state_size()) { - CHECK_EQ(param_.test_state_size(), num_test_net_instances) - << "test_state must be unspecified or specified once per test net."; - } - if (num_test_net_instances) { - CHECK_GT(param_.test_interval(), 0); - } - int test_net_id = 0; - vector < string > sources(num_test_net_instances); - vector < NetParameter > net_params(num_test_net_instances); - for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { - sources[test_net_id] = "test_net_param"; - net_params[test_net_id].CopyFrom(param_.test_net_param(i)); - } - for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { - sources[test_net_id] = "test_net file: " + param_.test_net(i); - ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); - } - const int remaining_test_nets = param_.test_iter_size() - test_net_id; - if (has_net_param) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net_param"; - net_params[test_net_id].CopyFrom(param_.net_param()); - } - } - if (has_net_file) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net file: " + param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); - } - } - test_nets_.resize(num_test_net_instances); - for (int i = 0; i < num_test_net_instances; ++i) { - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param - // itself; finally, merge in any NetState specified by the test_state - // (highest precedence). 
- NetState net_state; - net_state.set_phase(TEST); - net_state.MergeFrom(net_params[i].state()); - if (param_.test_state_size()) { - net_state.MergeFrom(param_.test_state(i)); - } - net_params[i].mutable_state()->CopyFrom(net_state); - LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; - test_nets_[i].reset(new Net(net_params[i])); - test_nets_[i]->set_debug_info(param_.debug_info()); - } + const bool has_net_param = param_.has_net_param(); + const bool has_net_file = param_.has_net(); + const int num_generic_nets = has_net_param + has_net_file; + CHECK_LE(num_generic_nets, 1) + << "Both net_param and net_file may not be specified."; + const int num_test_net_params = param_.test_net_param_size(); + const int num_test_net_files = param_.test_net_size(); + const int num_test_nets = num_test_net_params + num_test_net_files; + if (num_generic_nets) { + CHECK_GE(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; + } else { + CHECK_EQ(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; + } + // If we have a generic net (specified by net or net_param, rather than + // test_net or test_net_param), we may have an unlimited number of actual + // test networks -- the actual number is given by the number of remaining + // test_iters after any test nets specified by test_net_param and/or test_net + // are evaluated. 
+ const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; + const int num_test_net_instances = num_test_nets + num_generic_net_instances; + if (param_.test_state_size()) { + CHECK_EQ(param_.test_state_size(), num_test_net_instances) + << "test_state must be unspecified or specified once per test net."; + } + if (num_test_net_instances) { + CHECK_GT(param_.test_interval(), 0); + } + int test_net_id = 0; + vector < string > sources(num_test_net_instances); + vector < NetParameter > net_params(num_test_net_instances); + for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { + sources[test_net_id] = "test_net_param"; + net_params[test_net_id].CopyFrom(param_.test_net_param(i)); + } + for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { + sources[test_net_id] = "test_net file: " + param_.test_net(i); + ReadNetParamsFromTextFileOrDie(param_.test_net(i), + &net_params[test_net_id]); + } + const int remaining_test_nets = param_.test_iter_size() - test_net_id; + if (has_net_param) { + for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { + sources[test_net_id] = "net_param"; + net_params[test_net_id].CopyFrom(param_.net_param()); + } + } + if (has_net_file) { + for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { + sources[test_net_id] = "net file: " + param_.net(); + ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); + } + } + test_nets_.resize(num_test_net_instances); + for (int i = 0; i < num_test_net_instances; ++i) { + // Set the correct NetState. We start with the solver defaults (lowest + // precedence); then, merge in any NetState specified by the net_param + // itself; finally, merge in any NetState specified by the test_state + // (highest precedence). 
+ NetState net_state; + net_state.set_phase(TEST); + net_state.MergeFrom(net_params[i].state()); + if (param_.test_state_size()) { + net_state.MergeFrom(param_.test_state(i)); + } + net_params[i].mutable_state()->CopyFrom(net_state); + LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; + test_nets_[i].reset(new Net(net_params[i])); + test_nets_[i]->set_debug_info(param_.debug_info()); + } } template void Solver::Step(int iters) { - vector*> bottom_vec; - const int start_iter = iter_; - const int stop_iter = iter_ + iters; - int average_loss = this->param_.average_loss(); - vector < Dtype > losses; - Dtype smoothed_loss = 0; - - while (iter_ < stop_iter) { - // zero-init the params - for (int i = 0; i < net_->params().size(); ++i) { - shared_ptr < Blob > blob = net_->params()[i]; - switch (Caffe::mode()) { - case Caffe::CPU: - caffe_set(blob->count(), static_cast(0), - blob->mutable_cpu_diff()); - break; - case Caffe::GPU: - #ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + vector*> bottom_vec; + const int start_iter = iter_; + const int stop_iter = iter_ + iters; + int average_loss = this->param_.average_loss(); + vector < Dtype > losses; + Dtype smoothed_loss = 0; + + while (iter_ < stop_iter) { + // zero-init the params + for (int i = 0; i < net_->params().size(); ++i) { + shared_ptr < Blob > blob = net_->params()[i]; + switch (Caffe::mode()) { + case Caffe::CPU: + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); + break; + case Caffe::GPU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - case Caffe::APU: - #ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); + case Caffe::APU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - } - - if (param_.test_interval() && 
iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization())) { - TestAll(); - } - - const bool display = param_.display() && iter_ % param_.display() == 0; - net_->set_debug_info(display && param_.debug_info()); - // accumulate the loss and gradient - Dtype loss = 0; - for (int i = 0; i < param_.iter_size(); ++i) { - loss += net_->ForwardBackward(bottom_vec); - } - loss /= param_.iter_size(); - // average the loss across iterations for smoothed reporting - if (losses.size() < average_loss) { - losses.push_back(loss); - int size = losses.size(); - smoothed_loss = (smoothed_loss * (size - 1) + loss) / size; - } else { - int idx = (iter_ - start_iter) % average_loss; - smoothed_loss += (loss - losses[idx]) / average_loss; - losses[idx] = loss; - } - if (display) { - LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; - const vector*>& result = net_->output_blobs(); - int score_index = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - const string& output_name = - net_->blob_names()[net_->output_blob_indices()[j]]; - const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; - for (int k = 0; k < result[j]->count(); ++k) { - ostringstream loss_msg_stream; - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; - } - LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); - } - } - } - ApplyUpdate(); - - // Increment the internal iter_ counter -- its value should always indicate - // the number of times the weights have been updated. - ++iter_; - - // Save a snapshot if needed. 
- if (param_.snapshot() && iter_ % param_.snapshot() == 0) { - Snapshot(); - } - } + break; + } + } + + if (param_.test_interval() && iter_ % param_.test_interval() == 0 + && (iter_ > 0 || param_.test_initialization())) { + TestAll(); + } + + const bool display = param_.display() && iter_ % param_.display() == 0; + net_->set_debug_info(display && param_.debug_info()); + // accumulate the loss and gradient + Dtype loss = 0; + for (int i = 0; i < param_.iter_size(); ++i) { + loss += net_->ForwardBackward(bottom_vec); + } + loss /= param_.iter_size(); + // average the loss across iterations for smoothed reporting + if (losses.size() < average_loss) { + losses.push_back(loss); + int size = losses.size(); + smoothed_loss = (smoothed_loss * (size - 1) + loss) / size; + } else { + int idx = (iter_ - start_iter) % average_loss; + smoothed_loss += (loss - losses[idx]) / average_loss; + losses[idx] = loss; + } + if (display) { + LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss; + const vector*>& result = net_->output_blobs(); + int score_index = 0; + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); + const string& output_name = + net_->blob_names()[net_->output_blob_indices()[j]]; + const Dtype loss_weight = + net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + for (int k = 0; k < result[j]->count(); ++k) { + ostringstream loss_msg_stream; + if (loss_weight) { + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * result_vec[k] << " loss)"; + } + LOG(INFO) << " Train net output #" << score_index++ << ": " + << output_name << " = " << result_vec[k] << loss_msg_stream.str(); + } + } + } + ApplyUpdate(); + + // Increment the internal iter_ counter -- its value should always indicate + // the number of times the weights have been updated. + ++iter_; + + // Save a snapshot if needed. 
+ if (param_.snapshot() && iter_ % param_.snapshot() == 0) { + Snapshot(); + } + } } template void Solver::Solve(const char* resume_file) { - LOG(INFO) << "Solving " << net_->name(); - LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); - - if (resume_file) { - LOG(INFO) << "Restoring previous solver status from " << resume_file; - Restore(resume_file); - } - - // For a network that is trained by the solver, no bottom or top vecs - // should be given, and we will just provide dummy vecs. - Step(param_.max_iter() - iter_); - // If we haven't already, save a snapshot after optimization, unless - // overridden by setting snapshot_after_train := false - if (param_.snapshot_after_train() - && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { - Snapshot(); - } - // After the optimization is done, run an additional train and test pass to - // display the train and test loss/outputs if appropriate (based on the - // display and test_interval settings, respectively). Unlike in the rest of - // training, for the train net we only run a forward pass as we've already - // updated the parameters "max_iter" times -- this final pass is only done to - // display the loss, which is computed in the forward pass. - if (param_.display() && iter_ % param_.display() == 0) { - Dtype loss; - net_->ForwardPrefilled(&loss); - LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss; - } - if (param_.test_interval() && iter_ % param_.test_interval() == 0) { - TestAll(); - } - LOG(INFO) << "Optimization Done."; + LOG(INFO) << "Solving " << net_->name(); + LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); + + if (resume_file) { + LOG(INFO) << "Restoring previous solver status from " << resume_file; + Restore(resume_file); + } + + // For a network that is trained by the solver, no bottom or top vecs + // should be given, and we will just provide dummy vecs. 
+ Step(param_.max_iter() - iter_); + // If we haven't already, save a snapshot after optimization, unless + // overridden by setting snapshot_after_train := false + if (param_.snapshot_after_train() + && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { + Snapshot(); + } + // After the optimization is done, run an additional train and test pass to + // display the train and test loss/outputs if appropriate (based on the + // display and test_interval settings, respectively). Unlike in the rest of + // training, for the train net we only run a forward pass as we've already + // updated the parameters "max_iter" times -- this final pass is only done to + // display the loss, which is computed in the forward pass. + if (param_.display() && iter_ % param_.display() == 0) { + Dtype loss; + net_->ForwardPrefilled(&loss); + LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss; + } + if (param_.test_interval() && iter_ % param_.test_interval() == 0) { + TestAll(); + } + LOG(INFO) << "Optimization Done."; } template void Solver::TestAll() { - for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { - Test(test_net_id); - } + for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { + Test(test_net_id); + } } template void Solver::Test(const int test_net_id) { - LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; - CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); - vector < Dtype > test_score; - vector test_score_output_id; - vector*> bottom_vec; - const shared_ptr >& test_net = test_nets_[test_net_id]; - Dtype loss = 0; - for (int i = 0; i < param_.test_iter(test_net_id); ++i) { - Dtype iter_loss; - const vector*>& result = - test_net->Forward(bottom_vec, &iter_loss); - if (param_.test_compute_loss()) { - loss += iter_loss; - } - if (i == 0) { - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < 
result[j]->count(); ++k) { - test_score.push_back(result_vec[k]); - test_score_output_id.push_back(j); - } - } - } else { - int idx = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < result[j]->count(); ++k) { - test_score[idx++] += result_vec[k]; - } - } - } - } - if (param_.test_compute_loss()) { - loss /= param_.test_iter(test_net_id); - LOG(INFO) << "Test loss: " << loss; - } - for (int i = 0; i < test_score.size(); ++i) { - const int output_blob_index = - test_net->output_blob_indices()[test_score_output_id[i]]; - const string& output_name = test_net->blob_names()[output_blob_index]; - const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; - ostringstream loss_msg_stream; - const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; - } - LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " - << mean_score << loss_msg_stream.str(); - } + LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id + << ")"; + CHECK_NOTNULL(test_nets_[test_net_id].get())->ShareTrainedLayersWith( + net_.get()); + vector < Dtype > test_score; + vector test_score_output_id; + vector*> bottom_vec; + const shared_ptr >& test_net = test_nets_[test_net_id]; + Dtype loss = 0; + for (int i = 0; i < param_.test_iter(test_net_id); ++i) { + Dtype iter_loss; + const vector*>& result = test_net->Forward(bottom_vec, + &iter_loss); + if (param_.test_compute_loss()) { + loss += iter_loss; + } + if (i == 0) { + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); + for (int k = 0; k < result[j]->count(); ++k) { + test_score.push_back(result_vec[k]); + test_score_output_id.push_back(j); + } + } + } else { + int idx = 0; + for (int j = 0; j < result.size(); ++j) { + const Dtype* result_vec = result[j]->cpu_data(); 
+ for (int k = 0; k < result[j]->count(); ++k) { + test_score[idx++] += result_vec[k]; + } + } + } + } + if (param_.test_compute_loss()) { + loss /= param_.test_iter(test_net_id); + LOG(INFO) << "Test loss: " << loss; + } + for (int i = 0; i < test_score.size(); ++i) { + const int output_blob_index = + test_net->output_blob_indices()[test_score_output_id[i]]; + const string& output_name = test_net->blob_names()[output_blob_index]; + const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; + ostringstream loss_msg_stream; + const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); + if (loss_weight) { + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * mean_score << " loss)"; + } + LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " + << mean_score << loss_msg_stream.str(); + } } template void Solver::Snapshot() { - NetParameter net_param; - // For intermediate results, we will also dump the gradient values. - net_->ToProto(&net_param, param_.snapshot_diff()); - string filename(param_.snapshot_prefix()); - string model_filename, snapshot_filename; - const int kBufferSize = 20; - char iter_str_buffer[kBufferSize]; - snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); - filename += iter_str_buffer; - model_filename = filename + ".caffemodel"; - LOG(INFO) << "Snapshotting to " << model_filename; - WriteProtoToBinaryFile(net_param, model_filename.c_str()); - SolverState state; - SnapshotSolverState(&state); - state.set_iter(iter_); - state.set_learned_net(model_filename); - state.set_current_step(current_step_); - snapshot_filename = filename + ".solverstate"; - LOG(INFO) << "Snapshotting solver state to " << snapshot_filename; - WriteProtoToBinaryFile(state, snapshot_filename.c_str()); + NetParameter net_param; + // For intermediate results, we will also dump the gradient values. 
+ net_->ToProto(&net_param, param_.snapshot_diff()); + string filename(param_.snapshot_prefix()); + string model_filename, snapshot_filename; + const int kBufferSize = 20; + char iter_str_buffer[kBufferSize]; + snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); + filename += iter_str_buffer; + model_filename = filename + ".caffemodel"; + LOG(INFO) << "Snapshotting to " << model_filename; + WriteProtoToBinaryFile(net_param, model_filename.c_str()); + SolverState state; + SnapshotSolverState(&state); + state.set_iter(iter_); + state.set_learned_net(model_filename); + state.set_current_step(current_step_); + snapshot_filename = filename + ".solverstate"; + LOG(INFO) << "Snapshotting solver state to " << snapshot_filename; + WriteProtoToBinaryFile(state, snapshot_filename.c_str()); } template void Solver::Restore(const char* state_file) { - SolverState state; - NetParameter net_param; - ReadProtoFromBinaryFile(state_file, &state); - if (state.has_learned_net()) { - ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); - net_->CopyTrainedLayersFrom(net_param); - } - iter_ = state.iter(); - current_step_ = state.current_step(); - RestoreSolverState(state); + SolverState state; + NetParameter net_param; + ReadProtoFromBinaryFile(state_file, &state); + if (state.has_learned_net()) { + ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); + net_->CopyTrainedLayersFrom(net_param); + } + iter_ = state.iter(); + current_step_ = state.current_step(); + RestoreSolverState(state); } // Return the current learning rate. The currently implemented learning rate @@ -419,382 +415,379 @@ void Solver::Restore(const char* state_file) { // in the solver parameter protocol buffer, and iter is the current iteration. 
template Dtype SGDSolver::GetLearningRate() { - Dtype rate; - const string& lr_policy = this->param_.lr_policy(); - if (lr_policy == "fixed") { - rate = this->param_.base_lr(); - } else if (lr_policy == "step") { - this->current_step_ = this->iter_ / this->param_.stepsize(); - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "exp") { - rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); - } else if (lr_policy == "inv") { - rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - -this->param_.power()); - } else if (lr_policy == "multistep") { - if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { - this->current_step_++; - LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; - } - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "poly") { - rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); - } else if (lr_policy == "sigmoid") { - rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) 
+ exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); - } else { - LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; - } - return rate; + Dtype rate; + const string& lr_policy = this->param_.lr_policy(); + if (lr_policy == "fixed") { + rate = this->param_.base_lr(); + } else if (lr_policy == "step") { + this->current_step_ = this->iter_ / this->param_.stepsize(); + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); + } else if (lr_policy == "exp") { + rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); + } else if (lr_policy == "inv") { + rate = this->param_.base_lr() + * pow(Dtype(1) + this->param_.gamma() * this->iter_, + -this->param_.power()); + } else if (lr_policy == "multistep") { + if (this->current_step_ < this->param_.stepvalue_size() + && this->iter_ >= this->param_.stepvalue(this->current_step_)) { + this->current_step_++; + LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = " + << this->current_step_; + } + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); + } else if (lr_policy == "poly") { + rate = this->param_.base_lr() + * pow(Dtype(1.) - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + this->param_.power()); + } else if (lr_policy == "sigmoid") { + rate = + this->param_.base_lr() + * (Dtype(1.) + / (Dtype(1.) 
+ + exp( + -this->param_.gamma() + * (Dtype(this->iter_) + - Dtype(this->param_.stepsize()))))); + } else { + LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; + } + return rate; } template void SGDSolver::PreSolve() { - // Initialize the history - const vector > >& net_params = this->net_->params(); - history_.clear(); - update_.clear(); - temp_.clear(); - for (int i = 0; i < net_params.size(); ++i) { - const vector& shape = net_params[i]->shape(); - history_.push_back(shared_ptr < Blob > (new Blob(shape))); - update_.push_back(shared_ptr < Blob > (new Blob(shape))); - temp_.push_back(shared_ptr < Blob > (new Blob(shape))); - } + // Initialize the history + const vector > >& net_params = this->net_->params(); + history_.clear(); + update_.clear(); + temp_.clear(); + for (int i = 0; i < net_params.size(); ++i) { + const vector& shape = net_params[i]->shape(); + history_.push_back(shared_ptr < Blob > (new Blob(shape))); + update_.push_back(shared_ptr < Blob > (new Blob(shape))); + temp_.push_back(shared_ptr < Blob > (new Blob(shape))); + } } template void SGDSolver::ClipGradients() { - const Dtype clip_gradients = this->param_.clip_gradients(); - if (clip_gradients < 0) { - return; - } - const vector > >& net_params = this->net_->params(); - Dtype sumsq_diff = 0; - for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - sumsq_diff += net_params[i]->sumsq_diff(); - } - } - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - if (l2norm_diff > clip_gradients) { - Dtype scale_factor = clip_gradients / l2norm_diff; - LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " - << l2norm_diff << " > " << clip_gradients << ") " - << "by scale factor " << scale_factor; - for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - net_params[i]->scale_diff(scale_factor); - } - } - } + const Dtype clip_gradients = this->param_.clip_gradients(); + if (clip_gradients < 0) { + return; + } + const 
vector > >& net_params = this->net_->params(); + Dtype sumsq_diff = 0; + for (int i = 0; i < net_params.size(); ++i) { + if (this->net_->param_owners()[i] < 0) { + sumsq_diff += net_params[i]->sumsq_diff(); + } + } + const Dtype l2norm_diff = std::sqrt(sumsq_diff); + if (l2norm_diff > clip_gradients) { + Dtype scale_factor = clip_gradients / l2norm_diff; + LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " + << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor " + << scale_factor; + for (int i = 0; i < net_params.size(); ++i) { + if (this->net_->param_owners()[i] < 0) { + net_params[i]->scale_diff(scale_factor); + } + } + } } template void SGDSolver::ApplyUpdate() { - Dtype rate = GetLearningRate(); - if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; - } - ClipGradients(); - for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { - Normalize(param_id); - Regularize(param_id); - ComputeUpdateValue(param_id, rate); - } - this->net_->Update(); + Dtype rate = GetLearningRate(); + if (this->param_.display() && this->iter_ % this->param_.display() == 0) { + LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + } + ClipGradients(); + for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { + Normalize(param_id); + Regularize(param_id); + ComputeUpdateValue(param_id, rate); + } + this->net_->Update(); } template void SGDSolver::Normalize(int param_id) { - if (this->param_.iter_size() == 1) { - return; - } - // Scale gradient to counterbalance accumulation. - const vector > >& net_params = this->net_->params(); - const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); - switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + if (this->param_.iter_size() == 1) { + return; + } + // Scale gradient to counterbalance accumulation. + const vector > >& net_params = this->net_->params(); + const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); + switch (Caffe::mode()) { + case Caffe::CPU: { + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_gpu_diff()); + caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } template void SGDSolver::Regularize(int param_id) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_weight_decay = - this->net_->params_weight_decay(); - Dtype weight_decay = this->param_.weight_decay(); - string regularization_type = this->param_.regularization_type(); - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - - switch (Caffe::mode()) { - case Caffe::CPU: { - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else if (regularization_type == "L1") { - caffe_cpu_sign(net_params[param_id]->count(), - net_params[param_id]->cpu_data(), - temp_[param_id]->mutable_cpu_data()); - caffe_axpy(net_params[param_id]->count(), - local_decay, - 
temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + switch (Caffe::mode()) { + case Caffe::CPU: { + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_axpy(net_params[param_id]->count(), local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else if (regularization_type == "L1") { + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else if (regularization_type == "L1") { - caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), local_decay, + 
net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), local_decay, + temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } template void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - // Compute the update to history, then copy it to the parameter diff. - switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; + // Compute the update to history, then copy it to the parameter diff. 
+ switch (Caffe::mode()) { + case Caffe::CPU: { + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + history_[param_id]->mutable_gpu_data()); + caffe_gpu_copy(net_params[param_id]->count(), + history_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } template void SGDSolver::SnapshotSolverState(SolverState* state) { - state->clear_history(); - for (int i = 0; i < history_.size(); ++i) { - // Add history - BlobProto* history_blob = state->add_history(); - history_[i]->ToProto(history_blob); - } + state->clear_history(); + for (int i = 0; i < history_.size(); ++i) { + // Add history + BlobProto* history_blob = state->add_history(); + history_[i]->ToProto(history_blob); + } } template void SGDSolver::RestoreSolverState(const SolverState& state) { - CHECK_EQ(state.history_size(), history_.size()) - << "Incorrect length of history blobs."; - LOG(INFO) << "SGDSolver: restoring history"; - for (int i = 0; i < history_.size(); ++i) { - history_[i]->FromProto(state.history(i)); - } + CHECK_EQ(state.history_size(), history_.size()) + << "Incorrect length of history blobs."; + LOG(INFO) << "SGDSolver: restoring 
history"; + for (int i = 0; i < history_.size(); ++i) { + history_[i]->FromProto(state.history(i)); + } } template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - this->history_[param_id]->mutable_cpu_data()); - - // compute update: step back then over step - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->cpu_data(), -momentum, - this->update_[param_id]->mutable_cpu_data()); - - // copy - caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; + switch (Caffe::mode()) { + case Caffe::CPU: { + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + // update history + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + this->history_[param_id]->mutable_cpu_data()); + + // compute update: step back then over step + caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->cpu_data(), -momentum, + 
this->update_[param_id]->mutable_cpu_data()); + + // copy + caffe_copy(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - // update history - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - this->history_[param_id]->mutable_gpu_data()); - - // compute update: step back then over step - caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->gpu_data(), -momentum, - this->update_[param_id]->mutable_gpu_data()); - - // copy - caffe_gpu_copy(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + // save history momentum for stepping back + caffe_copy(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + // update history + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->gpu_diff(), momentum, + this->history_[param_id]->mutable_gpu_data()); + + // compute update: step back then over step + caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, + this->history_[param_id]->gpu_data(), -momentum, + this->update_[param_id]->mutable_gpu_data()); + + // copy + caffe_gpu_copy(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = 
this->net_->params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype delta = this->param_.delta(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_add(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->history_[param_id]->mutable_cpu_data()); - - // prepare update - caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_cpu_data()); - - caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // scale and copy - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { + const vector > >& net_params = this->net_->params(); + const vector& net_params_lr = this->net_->params_lr(); + Dtype delta = this->param_.delta(); + Dtype local_rate = rate * net_params_lr[param_id]; + switch (Caffe::mode()) { + case Caffe::CPU: { + // compute square of gradient in update + caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), + Dtype(2), this->update_[param_id]->mutable_cpu_data()); + + // update history + caffe_add(net_params[param_id]->count(), + this->update_[param_id]->cpu_data(), + this->history_[param_id]->cpu_data(), + this->history_[param_id]->mutable_cpu_data()); + + // prepare update + caffe_powx(net_params[param_id]->count(), + this->history_[param_id]->cpu_data(), Dtype(0.5), + 
this->update_[param_id]->mutable_cpu_data()); + + caffe_add_scalar(net_params[param_id]->count(), delta, + this->update_[param_id]->mutable_cpu_data()); + + caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), + this->update_[param_id]->cpu_data(), + this->update_[param_id]->mutable_cpu_data()); + + // scale and copy + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->cpu_data(), Dtype(0), + net_params[param_id]->mutable_cpu_diff()); + break; + } + case Caffe::GPU: { #ifndef CPU_ONLY - // compute square of gradient in update - caffe_gpu_powx(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), Dtype(2), - this->update_[param_id]->mutable_gpu_data()); - - // update history - caffe_gpu_add(net_params[param_id]->count(), - this->update_[param_id]->gpu_data(), - this->history_[param_id]->gpu_data(), - this->history_[param_id]->mutable_gpu_data()); - - // prepare update - caffe_gpu_powx(net_params[param_id]->count(), - this->history_[param_id]->gpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_gpu_data()); - - caffe_gpu_div(net_params[param_id]->count(), - net_params[param_id]->gpu_diff(), - this->update_[param_id]->gpu_data(), - this->update_[param_id]->mutable_gpu_data()); - - // scale and copy - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->gpu_data(), Dtype(0), - net_params[param_id]->mutable_gpu_diff()); + // compute square of gradient in update + caffe_gpu_powx(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), Dtype(2), + this->update_[param_id]->mutable_gpu_data()); + + // update history + caffe_gpu_add(net_params[param_id]->count(), + this->update_[param_id]->gpu_data(), + this->history_[param_id]->gpu_data(), + this->history_[param_id]->mutable_gpu_data()); + + // prepare update + 
caffe_gpu_powx(net_params[param_id]->count(), + this->history_[param_id]->gpu_data(), Dtype(0.5), + this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_add_scalar < Dtype + > (net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); + + caffe_gpu_div(net_params[param_id]->count(), + net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), + this->update_[param_id]->mutable_gpu_data()); + + // scale and copy + caffe_gpu_axpby(net_params[param_id]->count(), local_rate, + this->update_[param_id]->gpu_data(), Dtype(0), + net_params[param_id]->mutable_gpu_diff()); #else - NO_GPU; + NO_GPU; #endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } } INSTANTIATE_CLASS (Solver); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 976130bf..db470434 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -36,150 +36,149 @@ namespace caffe { SyncedMemory::~SyncedMemory() { - if (cpu_ptr_ && own_cpu_data_) { - OCL_CHECK( - clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, - cpu_ptr_, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); - } - if (gpu_cache_ptr_ && own_cpu_data_) { - OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_)); - } - if (gpu_ptr_) { - OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_)); - } - - clReleaseKernel (oclmem_kernel); + if (cpu_ptr_ && own_cpu_data_) { + OCL_CHECK( + clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + cpu_ptr_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + } + if (gpu_cache_ptr_ && own_cpu_data_) { + OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_)); + } + if (gpu_ptr_) { + OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_)); + } + + clReleaseKernel (oclmem_kernel); } //begin: code written/modified by AMD. 
void SyncedMemory::ocl_setup() { - cl_int err = 0; - oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); - OCL_CHECK(err); + cl_int err = 0; + oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); + OCL_CHECK(err); } inline void SyncedMemory::to_cpu() { - switch (head_) { - case UNINITIALIZED: - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, - size_, NULL, NULL); - cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, - (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, - size_, - 0, NULL, NULL, NULL); - memset(cpu_ptr_, 0, size_); - head_ = HEAD_AT_CPU; - own_cpu_data_ = true; - break; - case HEAD_AT_GPU: { + switch (head_) { + case UNINITIALIZED: + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, + size_, NULL, NULL); + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, + 0, NULL, NULL, NULL); + memset(cpu_ptr_, 0, size_); + head_ = HEAD_AT_CPU; + own_cpu_data_ = true; + break; + case HEAD_AT_GPU: { #ifndef CPU_ONLY - if (cpu_ptr_ == NULL) { - gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, - CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); - cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, - (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, - size_, 0, NULL, NULL, NULL); - own_cpu_data_ = true; - } - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_, - (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); - head_ = SYNCED; + if (cpu_ptr_ == NULL) { + gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, + size_, NULL, NULL); + cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, + (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, + size_, 0, NULL, NULL, NULL); + own_cpu_data_ = true; + } + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_, + 
(cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + head_ = SYNCED; #else - NO_GPU; + NO_GPU; #endif - break; - } - case HEAD_AT_CPU: - case SYNCED: - break; - } + break; + } + case HEAD_AT_CPU: + case SYNCED: + break; + } } inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY - switch (head_) { - case UNINITIALIZED: { - cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - size_, NULL, NULL); - if (NULL == tmpMem) { - fprintf(stderr, "Failed to create memory object\n"); - break; - } - ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int))); - gpu_ptr_ = (void*) tmpMem; - head_ = HEAD_AT_GPU; - break; - } - case HEAD_AT_CPU: { - if (gpu_ptr_ == NULL) { - cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - size_, NULL, NULL); - if (NULL == tmpMem) { - fprintf(stderr, "Failed to create memory object\n"); - } - gpu_ptr_ = (void*) tmpMem; - } - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, - (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); - head_ = SYNCED; - break; - } - case HEAD_AT_GPU: - case SYNCED: - break; - } + switch (head_) { + case UNINITIALIZED: { + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, + NULL, NULL); + if (NULL == tmpMem) { + fprintf(stderr, "Failed to create memory object\n"); + break; + } + ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int))); + gpu_ptr_ = (void*) tmpMem; + head_ = HEAD_AT_GPU; + break; + } + case HEAD_AT_CPU: { + if (gpu_ptr_ == NULL) { + cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + size_, NULL, NULL); + if (NULL == tmpMem) { + fprintf(stderr, "Failed to create memory object\n"); + } + gpu_ptr_ = (void*) tmpMem; + } + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, + (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); + head_ 
= SYNCED; + break; + } + case HEAD_AT_GPU: + case SYNCED: + break; + } #else - NO_GPU; + NO_GPU; #endif } const void* SyncedMemory::cpu_data() { - to_cpu(); - return (const void*) cpu_ptr_; + to_cpu(); + return (const void*) cpu_ptr_; } void SyncedMemory::set_cpu_data(void* data) { - CHECK(data); - if (own_cpu_data_) { - CaffeFreeHost (cpu_ptr_); - } - cpu_ptr_ = data; - head_ = HEAD_AT_CPU; - own_cpu_data_ = false; + CHECK(data); + if (own_cpu_data_) { + CaffeFreeHost (cpu_ptr_); + } + cpu_ptr_ = data; + head_ = HEAD_AT_CPU; + own_cpu_data_ = false; } const void* SyncedMemory::gpu_data() { #ifndef CPU_ONLY - to_gpu(); - return (const void*) gpu_ptr_; + to_gpu(); + return (const void*) gpu_ptr_; #else - NO_GPU; + NO_GPU; #endif } void* SyncedMemory::mutable_cpu_data() { - to_cpu(); - head_ = HEAD_AT_CPU; - return cpu_ptr_; + to_cpu(); + head_ = HEAD_AT_CPU; + return cpu_ptr_; } void* SyncedMemory::mutable_gpu_data() { #ifndef CPU_ONLY - to_gpu(); - head_ = HEAD_AT_GPU; - return gpu_ptr_; + to_gpu(); + head_ = HEAD_AT_GPU; + return gpu_ptr_; #else - NO_GPU; + NO_GPU; #endif } const void *SyncedMemory::gpu_cache_data() { - return 0; + return 0; } } // namespace caffe diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 4c0ce04e..2dcf0e5a 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -6,114 +6,111 @@ namespace caffe { Timer::Timer() -: - initted_(false), - running_(false), - has_run_at_least_once_(false) { - Init(); + : initted_(false), running_(false), has_run_at_least_once_(false) { + Init(); } Timer::~Timer() { } void Timer::Start() { - if (!running()) { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - running_ = true; - has_run_at_least_once_ = true; - } + if (!running()) { + start_cpu_ = boost::posix_time::microsec_clock::local_time(); + running_ = true; + has_run_at_least_once_ = true; + } } void Timer::Stop() { - if (running()) { - stop_cpu_ = 
boost::posix_time::microsec_clock::local_time(); - running_ = false; - } + if (running()) { + stop_cpu_ = boost::posix_time::microsec_clock::local_time(); + running_ = false; + } } float Timer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - return elapsed_microseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + + elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); + return elapsed_microseconds_; } float Timer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - return elapsed_milliseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + + elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); + return elapsed_milliseconds_; } float Timer::Seconds() { - return MilliSeconds() / 1000.; + return MilliSeconds() / 1000.; } void Timer::Init() { - if (!initted()) { - if (Caffe::mode() == Caffe::GPU) { - } - initted_ = true; - } + if (!initted()) { + if (Caffe::mode() == Caffe::GPU) { + } + initted_ = true; + } } CPUTimer::CPUTimer() { - this->initted_ = true; - this->running_ = false; - this->has_run_at_least_once_ = false; + this->initted_ = true; + this->running_ = false; + this->has_run_at_least_once_ = false; } void CPUTimer::Start() { - if (!running()) { - this->start_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = true; - this->has_run_at_least_once_ = true; - } + if (!running()) { + this->start_cpu_ = 
boost::posix_time::microsec_clock::local_time(); + this->running_ = true; + this->has_run_at_least_once_ = true; + } } void CPUTimer::Stop() { - if (running()) { - this->stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = false; - } + if (running()) { + this->stop_cpu_ = boost::posix_time::microsec_clock::local_time(); + this->running_ = false; + } } float CPUTimer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); - return this->elapsed_milliseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + this->elapsed_milliseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_milliseconds(); + return this->elapsed_milliseconds_; } float CPUTimer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); - return this->elapsed_microseconds_; + if (!has_run_at_least_once()) { + LOG(WARNING) << "Timer has never been run before reading time."; + return 0; + } + if (running()) { + Stop(); + } + this->elapsed_microseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_microseconds(); + return this->elapsed_microseconds_; } } // namespace caffe diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp index 43492ce7..592017c5 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -2,22 +2,22 @@ #include "caffe/util/cudnn.hpp" namespace caffe { - namespace cudnn { + namespace cudnn { - float dataType::oneval = 1.0; - float dataType::zeroval = 0.0; - const void* dataType::one = - static_cast(&dataType::oneval); - const 
void* dataType::zero = - static_cast(&dataType::zeroval); + float dataType::oneval = 1.0; + float dataType::zeroval = 0.0; + const void* dataType::one = + static_cast(&dataType::oneval); + const void* dataType::zero = + static_cast(&dataType::zeroval); - double dataType::oneval = 1.0; - double dataType::zeroval = 0.0; - const void* dataType::one = - static_cast(&dataType::oneval); - const void* dataType::zero = - static_cast(&dataType::zeroval); + double dataType::oneval = 1.0; + double dataType::zeroval = 0.0; + const void* dataType::one = + static_cast(&dataType::oneval); + const void* dataType::zero = + static_cast(&dataType::zeroval); - } // namespace cudnn + } // namespace cudnn } // namespace caffe #endif diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp index 50d8cbf7..fd4de1bf 100644 --- a/src/caffe/util/db.cpp +++ b/src/caffe/util/db.cpp @@ -8,24 +8,24 @@ namespace caffe { namespace db { DB* GetDB(DataParameter::DB backend) { - switch (backend) { - case DataParameter_DB_LEVELDB: - return new LevelDB(); - case DataParameter_DB_LMDB: - return new LMDB(); - default: - LOG(FATAL) << "Unknown database backend"; - } + switch (backend) { + case DataParameter_DB_LEVELDB: + return new LevelDB(); + case DataParameter_DB_LMDB: + return new LMDB(); + default: + LOG(FATAL) << "Unknown database backend"; + } } DB* GetDB(const string& backend) { - if (backend == "leveldb") { - return new LevelDB(); - } else if (backend == "lmdb") { - return new LMDB(); - } else { - LOG(FATAL) << "Unknown database backend"; - } + if (backend == "leveldb") { + return new LevelDB(); + } else if (backend == "lmdb") { + return new LMDB(); + } else { + LOG(FATAL) << "Unknown database backend"; + } } } // namespace db diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index d8adce8a..d8eac5f7 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -6,16 +6,16 @@ namespace caffe { namespace db { void LevelDB::Open(const string& source, 
Mode mode) { - leveldb::Options options; - options.block_size = 65536; - options.write_buffer_size = 268435456; - options.max_open_files = 100; - options.error_if_exists = mode == NEW; - options.create_if_missing = mode != READ; - leveldb::Status status = leveldb::DB::Open(options, source, &db_); - CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); - LOG(INFO) << "Opened leveldb " << source; + leveldb::Options options; + options.block_size = 65536; + options.write_buffer_size = 268435456; + options.max_open_files = 100; + options.error_if_exists = mode == NEW; + options.create_if_missing = mode != READ; + leveldb::Status status = leveldb::DB::Open(options, source, &db_); + CHECK(status.ok()) << "Failed to open leveldb " << source << std::endl + << status.ToString(); + LOG(INFO) << "Opened leveldb " << source; } } // namespace db diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index bc1a0da1..126b3790 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -10,42 +10,42 @@ namespace db { const size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB void LMDB::Open(const string& source, Mode mode) { - MDB_CHECK(mdb_env_create(&mdb_env_)); - MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); - if(mode == NEW) { - CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; - } - int flags = 0; - if (mode == READ) { - flags = MDB_RDONLY | MDB_NOTLS; - } - MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664)); - LOG(INFO) << "Opened lmdb " << source; + MDB_CHECK(mdb_env_create(&mdb_env_)); + MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));if +( mode == NEW) { + CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; + } + int flags = 0; + if (mode == READ) { + flags = MDB_RDONLY | MDB_NOTLS; + } + MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664)); + LOG(INFO) << "Opened lmdb " << source; } LMDBCursor* LMDB::NewCursor() { - 
MDB_txn* mdb_txn; - MDB_cursor* mdb_cursor; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); - return new LMDBCursor(mdb_txn, mdb_cursor); + MDB_txn* mdb_txn; + MDB_cursor* mdb_cursor; + MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); + MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); + MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); + return new LMDBCursor(mdb_txn, mdb_cursor); } LMDBTransaction* LMDB::NewTransaction() { - MDB_txn* mdb_txn; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - return new LMDBTransaction(&mdb_dbi_, mdb_txn); + MDB_txn* mdb_txn; + MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); + MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); + return new LMDBTransaction(&mdb_dbi_, mdb_txn); } void LMDBTransaction::Put(const string& key, const string& value) { - MDB_val mdb_key, mdb_value; - mdb_key.mv_data = const_cast(key.data()); - mdb_key.mv_size = key.size(); - mdb_value.mv_data = const_cast(value.data()); - mdb_value.mv_size = value.size(); - MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0)); + MDB_val mdb_key, mdb_value; + mdb_key.mv_data = const_cast(key.data()); + mdb_key.mv_size = key.size(); + mdb_value.mv_data = const_cast(value.data()); + mdb_value.mv_size = value.size(); + MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0)); } } // namespace db diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 886ac85b..25349d26 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -37,350 +37,334 @@ namespace caffe { template extern std::string get_dtype_suffix(); template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - 
const int stride_h, const int stride_w, - Dtype* data_col) { - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int channels_col = channels * kernel_h * kernel_w; - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % kernel_w; - int h_offset = (c / kernel_w) % kernel_h; - int c_im = c / kernel_h / kernel_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = 0; - } - } - } +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = data_im[(c_im + * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = 0; + } + } + } } template void im2col_cpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const 
int stride_h, - const int stride_w, float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_cpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im) { - caffe_set(height * width * channels, Dtype(0), data_im); - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_h * patch_w; - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int c_im = c / patch_h / patch_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; - } - } - } +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { + caffe_set(height * width * channels, Dtype(0), data_im); + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int 
width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_h * patch_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int c_im = c / patch_h / patch_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_im[(c_im * height + h_pad) * width + w_pad] += data_col[(c + * height_col + h) * width_col + w]; + } + } + } } template void col2im_cpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_cpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset, int optnum) { - std::string kernel_name = "col2im_opt" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_im, const int img_offset, + int optnum) { + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= 
clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu_opt(const float* data_col, const int col_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_im, const int img_offset, int optnum); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, float* data_im, const int img_offset, + int optnum); template void col2im_gpu_opt(const double* data_col, - const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_im, const int img_offset, int optnum); + const int col_offset, const int channels, const int height, const int width, + const int ksize, const int pad, const int stride, double* data_im, + const int img_offset, int optnum); template void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, 
const int stride_w, - Dtype* data_col, const int col_offset) - { - std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w); - - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) { + std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w); + + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void im2col_gpu(const float* data_im, const int img_offset, - const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col, const int col_offset); + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int 
stride_w, float* data_col, const int col_offset); template void im2col_gpu(const double* data_im, const int img_offset, - const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col, const int col_offset); + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_col, const int col_offset); template -void col2im_gpu(const Dtype* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im, const int img_offset) - { - std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) 
&stride_w); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, + const int width, const int channels, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= 
clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu(const float* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, float* data_im, - const int img_offset); + const int height, const int width, const int channels, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im, const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_im, const int img_offset); + const int height, const int width, const int channels, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im, const int img_offset); template void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), 
(void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_col, const int col_offset) { + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 
8, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + clFinish(amdDevice.CommandQueue); } template void im2col_gpu(cl_kernel Kernel, const float* data_im, - const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset); + const int img_offset, const int channels, const int height, const int width, + const int ksize, const int pad, const int stride, float* data_col, + const int col_offset); template void im2col_gpu(cl_kernel Kernel, const double* data_im, - const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset); + const int img_offset, const int channels, const int height, const int width, + const int ksize, const int pad, const int stride, double* data_col, + const int col_offset); template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col, const int col_offset, int optnum) { - - std::string kernel_name = "im2col_opt" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = optnum * channels * height_col * width_col; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, Dtype* data_col, const int col_offset, + int optnum) { + + std::string kernel_name = "im2col_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = optnum * channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 4, 
sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void im2col_gpu_opt(const float* data_im, const int img_offset, - const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col, const int col_offset, int optnum); + const int channels, const int height, const int width, const int ksize, + const int pad, const int stride, float* data_col, const int col_offset, + int optnum); template void im2col_gpu_opt(const double* data_im, - const int img_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col, const int col_offset, int optnum); + const int img_offset, const int channels, const int height, const int width, + const int ksize, const int pad, const int stride, double* data_col, + const int col_offset, int optnum); template void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset) { - 
std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void col2im_gpu(const float* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, float* data_im, const int img_offset); + const int channels, const int height, const int width, const int psize, + const int pad, const int stride, float* data_im, const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, double* data_im, const int img_offset); + const int channels, const int height, const int width, const int psize, + const int pad, const int stride, double* data_im, const int img_offset); } // namespace caffe diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index 6435427e..0848017a 100644 --- a/src/caffe/util/im2col.cu +++ 
b/src/caffe/util/im2col.cu @@ -10,124 +10,121 @@ namespace caffe { template __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_col) { - CUDA_KERNEL_LOOP(index, n) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - Dtype* data_col_ptr = data_col; - data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; - const Dtype* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + i; - int w = w_in + j; - *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? 
- data_im_ptr[i * width + j] : 0; - data_col_ptr += height_col * width_col; - } - } - } + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_col) { + CUDA_KERNEL_LOOP(index, n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + Dtype* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? + data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } } template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { - // We are going to launch channels * height_col * width_col kernels, each - // kernel responsible for copying a single-channel grid. 
- int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int num_kernels = channels * height_col * width_col; - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_gpu_kernel<<>>( - num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, - pad_w, stride_h, stride_w, height_col, - width_col, data_col); - CUDA_POST_KERNEL_CHECK; +void im2col_gpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + // NOLINT_NEXT_LINE(whitespace/operators) + im2col_gpu_kernel<<>>( + num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, + pad_w, stride_h, stride_w, height_col, + width_col, data_col); + CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - float* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_gpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - double* data_col); + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int 
stride_h, const int stride_w, + double* data_col); template __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int height_col, const int width_col, - Dtype* data_im) { - CUDA_KERNEL_LOOP(index, n) { - Dtype val = 0; - int w = index % width + pad_w; - int h = (index / width) % height + pad_h; - int c = index / (width * height); - // compute the start and end of the output - int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; - int w_col_end = min(w / stride_w + 1, width_col); - int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1; - int h_col_end = min(h / stride_h + 1, height_col); - // equivalent implementation - int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; - int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; - int coeff_w_col = (1 - stride_w * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + Dtype* data_im) { + CUDA_KERNEL_LOOP(index, n) { + Dtype val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } } template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int num_kernels = channels * height * width; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - // NOLINT_NEXT_LINE(whitespace/operators) - col2im_gpu_kernel<<>>( - num_kernels, data_col, height, width, channels, patch_h, patch_w, - pad_h, pad_w, stride_h, stride_w, - height_col, width_col, data_im); - CUDA_POST_KERNEL_CHECK; +void col2im_gpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + col2im_gpu_kernel<<>>( + num_kernels, data_col, height, width, channels, patch_h, patch_w, + pad_h, pad_w, stride_h, stride_w, + height_col, width_col, data_im); + CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void col2im_gpu(const float* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_gpu(const double* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); } // namespace caffe diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp index 299d1fd0..7974b0ea 100644 --- a/src/caffe/util/insert_splits.cpp +++ b/src/caffe/util/insert_splits.cpp @@ -10,135 +10,136 @@ namespace caffe { void InsertSplits(const NetParameter& param, NetParameter* param_split) { - // Initialize by copying from the input NetParameter. - param_split->CopyFrom(param); - param_split->clear_layer(); - map > blob_name_to_last_top_idx; - map, pair > bottom_idx_to_source_top_idx; - map, int> top_idx_to_bottom_count; - map, float> top_idx_to_loss_weight; - map, int> top_idx_to_bottom_split_idx; - map layer_idx_to_layer_name; - layer_idx_to_layer_name[-1] = "input"; - // Determine the number of times each blob is used as an input (bottom) blob. 
- for (int i = 0; i < param.input_size(); ++i) { - const string& blob_name = param.input(i); - blob_name_to_last_top_idx[blob_name] = make_pair(-1, i); - } - for (int i = 0; i < param.layer_size(); ++i) { - const LayerParameter& layer_param = param.layer(i); - layer_idx_to_layer_name[i] = layer_param.name(); - for (int j = 0; j < layer_param.bottom_size(); ++j) { - const string& blob_name = layer_param.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { - LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; - } - const pair& bottom_idx = make_pair(i, j); - const pair& top_idx = blob_name_to_last_top_idx[blob_name]; - bottom_idx_to_source_top_idx[bottom_idx] = top_idx; - ++top_idx_to_bottom_count[top_idx]; - } - for (int j = 0; j < layer_param.top_size(); ++j) { - const string& blob_name = layer_param.top(j); - blob_name_to_last_top_idx[blob_name] = make_pair(i, j); - } - // A use of a top blob as a loss should be handled similarly to the use of - // a top blob as an input (bottom) blob to another layer. - const int last_loss = - std::min(layer_param.loss_weight_size(), layer_param.top_size()); - for (int j = 0; j < last_loss; ++j) { - const string& blob_name = layer_param.top(j); - const pair& top_idx = blob_name_to_last_top_idx[blob_name]; - top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j); - if (top_idx_to_loss_weight[top_idx]) { - ++top_idx_to_bottom_count[top_idx]; - } - } - } - // Create split layer for any input blobs used by other layer as bottom - // blobs more than once. 
- for (int i = 0; i < param.input_size(); ++i) { - const int split_count = top_idx_to_bottom_count[make_pair(-1, i)]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[-1]; - const string& blob_name = param.input(i); - LayerParameter* split_layer_param = param_split->add_layer(); - const float kZeroLossWeight = 0; - ConfigureSplitLayer(layer_name, blob_name, i, split_count, - kZeroLossWeight, split_layer_param); - } - } - for (int i = 0; i < param.layer_size(); ++i) { - LayerParameter* layer_param = param_split->add_layer(); - layer_param->CopyFrom(param.layer(i)); - // Replace any shared bottom blobs with split layer outputs. - for (int j = 0; j < layer_param->bottom_size(); ++j) { - const pair& top_idx = - bottom_idx_to_source_top_idx[make_pair(i, j)]; - const int split_count = top_idx_to_bottom_count[top_idx]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[top_idx.first]; - const string& blob_name = layer_param->bottom(j); - layer_param->set_bottom(j, SplitBlobName(layer_name, - blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); - } - } - // Create split layer for any top blobs used by other layer as bottom - // blobs more than once. - for (int j = 0; j < layer_param->top_size(); ++j) { - const pair& top_idx = make_pair(i, j); - const int split_count = top_idx_to_bottom_count[top_idx]; - if (split_count > 1) { - const string& layer_name = layer_idx_to_layer_name[i]; - const string& blob_name = layer_param->top(j); - LayerParameter* split_layer_param = param_split->add_layer(); - const float loss_weight = top_idx_to_loss_weight[top_idx]; - ConfigureSplitLayer(layer_name, blob_name, j, split_count, - loss_weight, split_layer_param); - if (loss_weight) { - layer_param->clear_loss_weight(); - top_idx_to_bottom_split_idx[top_idx]++; - } - } - } - } + // Initialize by copying from the input NetParameter. 
+ param_split->CopyFrom(param); + param_split->clear_layer(); + map > blob_name_to_last_top_idx; + map, pair > bottom_idx_to_source_top_idx; + map, int> top_idx_to_bottom_count; + map, float> top_idx_to_loss_weight; + map, int> top_idx_to_bottom_split_idx; + map layer_idx_to_layer_name; + layer_idx_to_layer_name[-1] = "input"; + // Determine the number of times each blob is used as an input (bottom) blob. + for (int i = 0; i < param.input_size(); ++i) { + const string& blob_name = param.input(i); + blob_name_to_last_top_idx[blob_name] = make_pair(-1, i); + } + for (int i = 0; i < param.layer_size(); ++i) { + const LayerParameter& layer_param = param.layer(i); + layer_idx_to_layer_name[i] = layer_param.name(); + for (int j = 0; j < layer_param.bottom_size(); ++j) { + const string& blob_name = layer_param.bottom(j); + if (blob_name_to_last_top_idx.find(blob_name) + == blob_name_to_last_top_idx.end()) { + LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; + } + const pair& bottom_idx = make_pair(i, j); + const pair& top_idx = blob_name_to_last_top_idx[blob_name]; + bottom_idx_to_source_top_idx[bottom_idx] = top_idx; + ++top_idx_to_bottom_count[top_idx]; + } + for (int j = 0; j < layer_param.top_size(); ++j) { + const string& blob_name = layer_param.top(j); + blob_name_to_last_top_idx[blob_name] = make_pair(i, j); + } + // A use of a top blob as a loss should be handled similarly to the use of + // a top blob as an input (bottom) blob to another layer. + const int last_loss = std::min(layer_param.loss_weight_size(), + layer_param.top_size()); + for (int j = 0; j < last_loss; ++j) { + const string& blob_name = layer_param.top(j); + const pair& top_idx = blob_name_to_last_top_idx[blob_name]; + top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j); + if (top_idx_to_loss_weight[top_idx]) { + ++top_idx_to_bottom_count[top_idx]; + } + } + } + // Create split layer for any input blobs used by other layer as bottom + // blobs more than once. 
+ for (int i = 0; i < param.input_size(); ++i) { + const int split_count = top_idx_to_bottom_count[make_pair(-1, i)]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[-1]; + const string& blob_name = param.input(i); + LayerParameter* split_layer_param = param_split->add_layer(); + const float kZeroLossWeight = 0; + ConfigureSplitLayer(layer_name, blob_name, i, split_count, + kZeroLossWeight, split_layer_param); + } + } + for (int i = 0; i < param.layer_size(); ++i) { + LayerParameter* layer_param = param_split->add_layer(); + layer_param->CopyFrom(param.layer(i)); + // Replace any shared bottom blobs with split layer outputs. + for (int j = 0; j < layer_param->bottom_size(); ++j) { + const pair& top_idx = bottom_idx_to_source_top_idx[make_pair(i, + j)]; + const int split_count = top_idx_to_bottom_count[top_idx]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[top_idx.first]; + const string& blob_name = layer_param->bottom(j); + layer_param->set_bottom(j, + SplitBlobName(layer_name, blob_name, top_idx.second, + top_idx_to_bottom_split_idx[top_idx]++)); + } + } + // Create split layer for any top blobs used by other layer as bottom + // blobs more than once. 
+ for (int j = 0; j < layer_param->top_size(); ++j) { + const pair& top_idx = make_pair(i, j); + const int split_count = top_idx_to_bottom_count[top_idx]; + if (split_count > 1) { + const string& layer_name = layer_idx_to_layer_name[i]; + const string& blob_name = layer_param->top(j); + LayerParameter* split_layer_param = param_split->add_layer(); + const float loss_weight = top_idx_to_loss_weight[top_idx]; + ConfigureSplitLayer(layer_name, blob_name, j, split_count, loss_weight, + split_layer_param); + if (loss_weight) { + layer_param->clear_loss_weight(); + top_idx_to_bottom_split_idx[top_idx]++; + } + } + } + } } void ConfigureSplitLayer(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_count, const float loss_weight, - LayerParameter* split_layer_param) { - split_layer_param->Clear(); - split_layer_param->add_bottom(blob_name); - split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx)); - split_layer_param->set_type("Split"); - for (int k = 0; k < split_count; ++k) { - split_layer_param->add_top( - SplitBlobName(layer_name, blob_name, blob_idx, k)); - if (loss_weight) { - if (k == 0) { - split_layer_param->add_loss_weight(loss_weight); - } else { - split_layer_param->add_loss_weight(0); - } - } - } + const int blob_idx, const int split_count, const float loss_weight, + LayerParameter* split_layer_param) { + split_layer_param->Clear(); + split_layer_param->add_bottom(blob_name); + split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx)); + split_layer_param->set_type("Split"); + for (int k = 0; k < split_count; ++k) { + split_layer_param->add_top( + SplitBlobName(layer_name, blob_name, blob_idx, k)); + if (loss_weight) { + if (k == 0) { + split_layer_param->add_loss_weight(loss_weight); + } else { + split_layer_param->add_loss_weight(0); + } + } + } } string SplitLayerName(const string& layer_name, const string& blob_name, - const int blob_idx) { - ostringstream split_layer_name; - 
split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split"; - return split_layer_name.str(); + const int blob_idx) { + ostringstream split_layer_name; + split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx + << "_split"; + return split_layer_name.str(); } string SplitBlobName(const string& layer_name, const string& blob_name, - const int blob_idx, const int split_idx) { - ostringstream split_blob_name; - split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx - << "_split_" << split_idx; - return split_blob_name.str(); + const int blob_idx, const int split_idx) { + ostringstream split_blob_name; + split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx + << "_split_" << split_idx; + return split_blob_name.str(); } } // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 63dcf312..09824880 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -30,277 +30,271 @@ using google::protobuf::io::CodedOutputStream; using google::protobuf::Message; bool ReadProtoFromTextFile(const char* filename, Message* proto) { - int fd = open(filename, O_RDONLY); - CHECK_NE(fd, -1) << "File not found: " << filename; - FileInputStream* input = new FileInputStream(fd); - bool success = google::protobuf::TextFormat::Parse(input, proto); - delete input; - close(fd); - return success; + int fd = open(filename, O_RDONLY); + CHECK_NE(fd, -1) << "File not found: " << filename; + FileInputStream* input = new FileInputStream(fd); + bool success = google::protobuf::TextFormat::Parse(input, proto); + delete input; + close(fd); + return success; } void WriteProtoToTextFile(const Message& proto, const char* filename) { - int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); - FileOutputStream* output = new FileOutputStream(fd); - CHECK(google::protobuf::TextFormat::Print(proto, output)); - delete output; - close(fd); + int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + 
FileOutputStream* output = new FileOutputStream(fd); + CHECK(google::protobuf::TextFormat::Print(proto, output)); + delete output; + close(fd); } bool ReadProtoFromBinaryFile(const char* filename, Message* proto) { - int fd = open(filename, O_RDONLY); - CHECK_NE(fd, -1) << "File not found: " << filename; - ZeroCopyInputStream* raw_input = new FileInputStream(fd); - CodedInputStream* coded_input = new CodedInputStream(raw_input); - coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912); + int fd = open(filename, O_RDONLY); + CHECK_NE(fd, -1) << "File not found: " << filename; + ZeroCopyInputStream* raw_input = new FileInputStream(fd); + CodedInputStream* coded_input = new CodedInputStream(raw_input); + coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912); - bool success = proto->ParseFromCodedStream(coded_input); + bool success = proto->ParseFromCodedStream(coded_input); - delete coded_input; - delete raw_input; - close(fd); - return success; + delete coded_input; + delete raw_input; + close(fd); + return success; } void WriteProtoToBinaryFile(const Message& proto, const char* filename) { - fstream output(filename, ios::out | ios::trunc | ios::binary); - CHECK(proto.SerializeToOstream(&output)); + fstream output(filename, ios::out | ios::trunc | ios::binary); + CHECK(proto.SerializeToOstream(&output)); } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { - cv::Mat cv_img; - int cv_read_flag = (is_color ? 
CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); - if (!cv_img_origin.data) { - LOG(ERROR) << "Could not open or find file " << filename; - return cv_img_origin; - } - if (height > 0 && width > 0) { - cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); - } else { - cv_img = cv_img_origin; - } - return cv_img; +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color) { + cv::Mat cv_img; + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); + cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); + if (!cv_img_origin.data) { + LOG(ERROR) << "Could not open or find file " << filename; + return cv_img_origin; + } + if (height > 0 && width > 0) { + cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); + } else { + cv_img = cv_img_origin; + } + return cv_img; } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { - return ReadImageToCVMat(filename, height, width, true); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width) { + return ReadImageToCVMat(filename, height, width, true); } -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { - return ReadImageToCVMat(filename, 0, 0, is_color); +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color) { + return ReadImageToCVMat(filename, 0, 0, is_color); } cv::Mat ReadImageToCVMat(const string& filename) { - return ReadImageToCVMat(filename, 0, 0, true); + return ReadImageToCVMat(filename, 0, 0, true); } // Do the file extension and encoding match? -static bool matchExt(const std::string & fn, - std::string en) { - size_t p = fn.rfind('.'); - std::string ext = p != fn.npos ? 
fn.substr(p) : fn; - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - std::transform(en.begin(), en.end(), en.begin(), ::tolower); - if (ext == en) - return true; - if (en == "jpg" && ext == "jpeg") - return true; - return false; +static bool matchExt(const std::string & fn, std::string en) { + size_t p = fn.rfind('.'); + std::string ext = p != fn.npos ? fn.substr(p) : fn; + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + std::transform(en.begin(), en.end(), en.begin(), ::tolower); + if (ext == en) + return true; + if (en == "jpg" && ext == "jpeg") + return true; + return false; } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { - cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); - if (cv_img.data) { - if (encoding.size()) { - if ((cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding)) - return ReadFileToDatum(filename, label, datum); - std::vector < uchar > buf; - cv::imencode("." + encoding, cv_img, buf); - datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); - datum->set_label(label); - datum->set_encoded(true); - return true; - } - CVMatToDatum(cv_img, datum); - datum->set_label(label); - return true; - } else { - return false; - } +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum) { + cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); + if (cv_img.data) { + if (encoding.size()) { + if ((cv_img.channels() == 3) == is_color && !height && !width + && matchExt(filename, encoding)) + return ReadFileToDatum(filename, label, datum); + std::vector < uchar > buf; + cv::imencode("." 
+ encoding, cv_img, buf); + datum->set_data( + std::string(reinterpret_cast(&buf[0]), buf.size())); + datum->set_label(label); + datum->set_encoded(true); + return true; + } + CVMatToDatum(cv_img, datum); + datum->set_label(label); + return true; + } else { + return false; + } } -bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { - std::streampos size; +bool ReadFileToDatum(const string& filename, const int label, Datum* datum) { + std::streampos size; - fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); - if (file.is_open()) { - size = file.tellg(); - std::string buffer(size, ' '); - file.seekg(0, ios::beg); - file.read(&buffer[0], size); - file.close(); - datum->set_data(buffer); - datum->set_label(label); - datum->set_encoded(true); - return true; - } else { - return false; - } + fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); + if (file.is_open()) { + size = file.tellg(); + std::string buffer(size, ' '); + file.seekg(0, ios::beg); + file.read(&buffer[0], size); + file.close(); + datum->set_data(buffer); + datum->set_label(label); + datum->set_encoded(true); + return true; + } else { + return false; + } } cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + data.size()); - cv_img = cv::imdecode(vec_data, -1); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; + cv::Mat cv_img; + CHECK(datum.encoded()) << "Datum not encoded"; + const string& data = datum.data(); + std::vector vec_data(data.c_str(), data.c_str() + data.size()); + cv_img = cv::imdecode(vec_data, -1); + if (!cv_img.data) { + LOG(ERROR) << "Could not decode datum "; + } + return cv_img; } cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = 
datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + data.size()); - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv_img = cv::imdecode(vec_data, cv_read_flag); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; + cv::Mat cv_img; + CHECK(datum.encoded()) << "Datum not encoded"; + const string& data = datum.data(); + std::vector vec_data(data.c_str(), data.c_str() + data.size()); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); + cv_img = cv::imdecode(vec_data, cv_read_flag); + if (!cv_img.data) { + LOG(ERROR) << "Could not decode datum "; + } + return cv_img; } // If Datum is encoded will decoded using DecodeDatumToCVMat and CVMatToDatum // If Datum is not encoded will do nothing bool DecodeDatumNative(Datum* datum) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } + if (datum->encoded()) { + cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); + CVMatToDatum(cv_img, datum); + return true; + } else { + return false; + } } bool DecodeDatum(Datum* datum, bool is_color) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } + if (datum->encoded()) { + cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); + CVMatToDatum(cv_img, datum); + return true; + } else { + return false; + } } void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - datum->set_channels(cv_img.channels()); - datum->set_height(cv_img.rows); - datum->set_width(cv_img.cols); - datum->clear_data(); - datum->clear_float_data(); - datum->set_encoded(false); - int datum_channels = datum->channels(); - int datum_height = datum->height(); - int datum_width = datum->width(); - int datum_size = 
datum_channels * datum_height * datum_width; - std::string buffer(datum_size, ' '); - for (int h = 0; h < datum_height; ++h) { - const uchar* ptr = cv_img.ptr < uchar > (h); - int img_index = 0; - for (int w = 0; w < datum_width; ++w) { - for (int c = 0; c < datum_channels; ++c) { - int datum_index = (c * datum_height + h) * datum_width + w; - buffer[datum_index] = static_cast(ptr[img_index++]); - } - } - } - datum->set_data(buffer); + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + datum->set_channels(cv_img.channels()); + datum->set_height(cv_img.rows); + datum->set_width(cv_img.cols); + datum->clear_data(); + datum->clear_float_data(); + datum->set_encoded(false); + int datum_channels = datum->channels(); + int datum_height = datum->height(); + int datum_width = datum->width(); + int datum_size = datum_channels * datum_height * datum_width; + std::string buffer(datum_size, ' '); + for (int h = 0; h < datum_height; ++h) { + const uchar* ptr = cv_img.ptr < uchar > (h); + int img_index = 0; + for (int w = 0; w < datum_width; ++w) { + for (int c = 0; c < datum_channels; ++c) { + int datum_index = (c * datum_height + h) * datum_width + w; + buffer[datum_index] = static_cast(ptr[img_index++]); + } + } + } + datum->set_data(buffer); } // Verifies format of data stored in HDF5 file and reshapes blob accordingly. template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { - // Verify that the dataset exists. - CHECK(H5LTfind_dataset(file_id, dataset_name_)) - << "Failed to find HDF5 dataset " << dataset_name_; - // Verify that the number of dimensions is in the accepted range. 
- herr_t status; - int ndims; - status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims); - CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_; - CHECK_GE(ndims, min_dim); - CHECK_LE(ndims, max_dim); +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob) { + // Verify that the dataset exists. + CHECK(H5LTfind_dataset(file_id, dataset_name_)) + << "Failed to find HDF5 dataset " << dataset_name_; + // Verify that the number of dimensions is in the accepted range. + herr_t status; + int ndims; + status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims); + CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_; + CHECK_GE(ndims, min_dim); + CHECK_LE(ndims, max_dim); - // Verify that the data format is what we expect: float or double. - std::vector < hsize_t > dims(ndims); - H5T_class_t class_; - status = H5LTget_dataset_info( - file_id, dataset_name_, dims.data(), &class_, NULL); - CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; - CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; + // Verify that the data format is what we expect: float or double. 
+ std::vector < hsize_t > dims(ndims); + H5T_class_t class_; + status = H5LTget_dataset_info(file_id, dataset_name_, dims.data(), &class_, + NULL); + CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; + CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; - vector blob_dims(dims.size()); - for (int i = 0; i < dims.size(); ++i) { - blob_dims[i] = dims[i]; - } - blob->Reshape(blob_dims); + vector blob_dims(dims.size()); + for (int i = 0; i < dims.size(); ++i) { + blob_dims[i] = dims[i]; + } + blob->Reshape(blob_dims); } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_float( - file_id, dataset_name_, blob->mutable_cpu_data()); - CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; + int min_dim, int max_dim, Blob* blob) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + herr_t status = H5LTread_dataset_float(file_id, dataset_name_, + blob->mutable_cpu_data()); + CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_double( - file_id, dataset_name_, blob->mutable_cpu_data()); - CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; + int min_dim, int max_dim, Blob* blob) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + herr_t status = H5LTread_dataset_double(file_id, dataset_name_, + blob->mutable_cpu_data()); + CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { 
- hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_float(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { - hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_double(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; } } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 3275d75c..0dfb1107 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -42,319 +42,308 @@ namespace caffe { template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int 
K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - int ldc = N; - //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); - CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, - 0, - ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - int ldc = N; - CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, - 0, - ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const int offA, const float* B, - const int offB, const float beta, float* C, const int offC) { - cl_event event; - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, - (cl_mem) C, - offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); - return event; + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; } template <> cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const int offA, const double* B, - const int offB, const double beta, double* C, const int offC) { - cl_event event; - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, - (cl_mem) C, - offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); - return event; + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; } template <> cl_event caffe_gpu_gemm(cl_command_queue *queue, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const int offA, const float* B, - const int offB, const float beta, float* C, const int offC) { - cl_event event; - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( - clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, - (cl_mem) C, - offC, ldc, 1, queue, 0, NULL, &event)); - return event; + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, const float* A, const int offA, + const float* B, const int offB, const float beta, float* C, + const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; } template <> cl_event caffe_gpu_gemm(cl_command_queue *queue, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const int offA, const double* B, - const int offB, const double beta, double* C, const int offC) { - cl_event event; - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - clblasTranspose transB = - (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = N; - CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, - (cl_mem) C, - offC, ldc, 1, queue, 0, NULL, &event)); - return event; + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const double alpha, const double* A, + const int offA, const double* B, const int offB, const double beta, + double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; } template <> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } template <> void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, size_t offA, int lda, - const float* x, size_t offx, const float beta, int incx, - float* y, size_t offy, int incy) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, - M, N, (cl_float) alpha, (cl_mem) A, offA, lda, - (cl_mem) x, offx, incx, (cl_float) beta, - (cl_mem) y, offy, incy, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, float* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_float) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, size_t offA, int lda, - const double* x, size_t offx, const double beta, int incx, - double* y, size_t offy, int incy) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( - clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, - offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, - incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, double* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA, - M, N, (cl_float) alpha, (cl_mem) A, 0, N, - (cl_mem) x, 0, 1, (cl_float) beta, - (cl_mem) y, 0, 1, - 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_float) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - clblasTranspose transA = - (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; - CLBLAS_CHECK( - clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, - N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { - cblas_saxpy(N, alpha, X, 1, Y, 1); + float* Y) { + cblas_saxpy(N, alpha, X, 1, Y, 1); } template <> void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { - cblas_daxpy(N, alpha, X, 1, Y, 1); + double* Y) { + cblas_daxpy(N, alpha, X, 1, Y, 1); } template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { - CLBLAS_CHECK( - clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); + float* Y) { + CLBLAS_CHECK( + clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { - CLBLAS_CHECK( - clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - 
&(amdDevice.CommandQueue), 0, NULL, NULL)); + double* Y) { + CLBLAS_CHECK( + clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> -void caffe_gpu_sgnbit(const int n, const float* x, float* y) - { +void caffe_gpu_sgnbit(const int n, const float* x, float* y) { } template <> -void caffe_gpu_sgnbit(const int n, const double* x, double* y) - { +void caffe_gpu_sgnbit(const int n, const double* x, double* y) { } template <> -void caffe_gpu_abs(const int n, const float* x, float* y) - { - caffe_gpu_abs_ocl(n, x, y); +void caffe_gpu_abs(const int n, const float* x, float* y) { + caffe_gpu_abs_ocl(n, x, y); } template <> -void caffe_gpu_abs(const int n, const double* x, double* y) - { - caffe_gpu_abs_ocl(n, x, y); +void caffe_gpu_abs(const int n, const double* x, double* y) { + caffe_gpu_abs_ocl(n, x, y); } template <> void caffe_set(const int N, const float alpha, float* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(float) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(float) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } template <> void caffe_set(const int N, const double alpha, double* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(double) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } } template <> void caffe_add_scalar(const int N, const double alpha, double* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } } template <> void caffe_copy(const int N, const float* X, float* Y) { - cblas_scopy(N, X, 1, Y, 1); + 
cblas_scopy(N, X, 1, Y, 1); } template <> void caffe_copy(const int N, const double* X, double* Y) { - cblas_dcopy(N, X, 1, Y, 1); + cblas_dcopy(N, X, 1, Y, 1); } //template -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) - { - clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, - NULL, NULL); +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, + NULL, NULL); // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); } /* @@ -364,179 +353,170 @@ void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); */ template <> -void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) - { - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, - N, - 0, NULL, NULL)); +void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); } template <> -void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) - { - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, - N, - 0, NULL, NULL)); +void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); } template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { - if (X != Y) { - CLBLAS_CHECK( - clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } } template <> void caffe_gpu_copy(const int N, const double* X, double* Y) { - if (X != Y) { - CLBLAS_CHECK( - 
clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } } template <> void caffe_scal(const int N, const float alpha, float *X) { - cblas_sscal(N, alpha, X, 1); + cblas_sscal(N, alpha, X, 1); } template <> void caffe_scal(const int N, const double alpha, double *X) { - cblas_dscal(N, alpha, X, 1); + cblas_dscal(N, alpha, X, 1); } template <> void caffe_gpu_scal(const int N, const float alpha, float *X) { - CLBLAS_CHECK( - clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); + CLBLAS_CHECK( + clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } template <> void caffe_gpu_scal(const int N, const double alpha, double *X) { - CLBLAS_CHECK( - clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); + CLBLAS_CHECK( + clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); } template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - cblas_saxpby(N, alpha, X, 1, beta, Y, 1); + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } template <> void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, 
double* Y) { - cblas_daxpby(N, alpha, X, 1, beta, Y, 1); + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } template <> -void caffe_add(const int n, const float* a, const float* b, - float* y) { - vsAdd(n, a, b, y); +void caffe_add(const int n, const float* a, const float* b, float* y) { + vsAdd(n, a, b, y); } template <> void caffe_add(const int n, const double* a, const double* b, - double* y) { - vdAdd(n, a, b, y); + double* y) { + vdAdd(n, a, b, y); } template <> -void caffe_sub(const int n, const float* a, const float* b, - float* y) { - vsSub(n, a, b, y); +void caffe_sub(const int n, const float* a, const float* b, float* y) { + vsSub(n, a, b, y); } template <> void caffe_sub(const int n, const double* a, const double* b, - double* y) { - vdSub(n, a, b, y); + double* y) { + vdSub(n, a, b, y); } template <> -void caffe_mul(const int n, const float* a, const float* b, - float* y) { - vsMul(n, a, b, y); +void caffe_mul(const int n, const float* a, const float* b, float* y) { + vsMul(n, a, b, y); } template <> void caffe_mul(const int n, const double* a, const double* b, - double* y) { - vdMul(n, a, b, y); + double* y) { + vdMul(n, a, b, y); } template <> -void caffe_div(const int n, const float* a, const float* b, - float* y) { - vsDiv(n, a, b, y); +void caffe_div(const int n, const float* a, const float* b, float* y) { + vsDiv(n, a, b, y); } template <> void caffe_div(const int n, const double* a, const double* b, - double* y) { - vdDiv(n, a, b, y); + double* y) { + vdDiv(n, a, b, y); } template <> -void caffe_powx(const int n, const float* a, const float b, - float* y) { - vsPowx(n, a, b, y); +void caffe_powx(const int n, const float* a, const float b, float* y) { + vsPowx(n, a, b, y); } template <> void caffe_powx(const int n, const double* a, const double b, - double* y) { - vdPowx(n, a, b, y); + double* y) { + vdPowx(n, a, b, y); } template <> void caffe_sqr(const int n, const float* a, float* y) { - vsSqr(n, a, y); + vsSqr(n, 
a, y); } template <> void caffe_sqr(const int n, const double* a, double* y) { - vdSqr(n, a, y); + vdSqr(n, a, y); } template <> void caffe_exp(const int n, const float* a, float* y) { - vsExp(n, a, y); + vsExp(n, a, y); } template <> void caffe_exp(const int n, const double* a, double* y) { - vdExp(n, a, y); + vdExp(n, a, y); } unsigned int caffe_rng_rand() { - return (*caffe_rng())(); + return (*caffe_rng())(); } template Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter < Dtype > ( - b, std::numeric_limits < Dtype > ::max()); + return boost::math::nextafter < Dtype + > (b, std::numeric_limits < Dtype > ::max()); } template @@ -547,62 +527,62 @@ double caffe_nextafter(const double b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_LE(a, b); - boost::uniform_real < Dtype - > random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real < Dtype + > random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } - //LOG(INFO) << "caffe_rng_uniform"; + //LOG(INFO) << "caffe_rng_uniform"; } template void caffe_rng_uniform(const int n, const float a, const float b, - float* r); + float* r); template void caffe_rng_uniform(const int n, const double a, const double b, - double* r); + double* r); template -void caffe_rng_gaussian(const int n, const Dtype a, - const Dtype sigma, Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GT(sigma, 0); - boost::normal_distribution < Dtype > random_distribution(a, sigma); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = 
variate_generator(); - } +void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma, + Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution < Dtype > random_distribution(a, sigma); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } } template -void caffe_rng_gaussian(const int n, const float mu, - const float sigma, float* r); +void caffe_rng_gaussian(const int n, const float mu, const float sigma, + float* r); template void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); + const double sigma, double* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution < Dtype > random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } } template @@ -613,16 +593,16 @@ void caffe_rng_bernoulli(const int n, const float p, int* r); template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution < Dtype > random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = static_cast(variate_generator()); - } + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + 
caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } } template @@ -633,104 +613,104 @@ void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); // template <> float caffe_cpu_dot(const int n, const float* x, const float* y) { - return cblas_sdot(n, x, 1, y, 1); + return cblas_sdot(n, x, 1, y, 1); } template <> double caffe_cpu_dot(const int n, const double* x, const double* y) { - return cblas_ddot(n, x, 1, y, 1); + return cblas_ddot(n, x, 1, y, 1); } template <> void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(float)), NULL, NULL); - cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(float)), NULL, NULL); - clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), - out, 0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_out); + float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); } template <> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { - //need to pass in scratchBuff - //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * 
sizeof(double)), NULL, NULL); - cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(double)), NULL, NULL); - clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), - out, 0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_out); + double * out) { + //need to pass in scratchBuff + //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); } template <> int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcount(static_cast(x[i]) ^ - static_cast(y[i])); - } - return dist; + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount( + static_cast(x[i]) ^ static_cast(y[i])); + } + return dist; } template <> int caffe_cpu_hamming_distance(const int n, const double* x, - const double* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcountl(static_cast(x[i]) ^ - static_cast(y[i])); - } - return dist; + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl( + static_cast(x[i]) ^ static_cast(y[i])); + } + return dist; } template <> float caffe_cpu_asum(const int n, const float* x) { - return cblas_sasum(n, x, 1); + 
return cblas_sasum(n, x, 1); } template <> double caffe_cpu_asum(const int n, const double* x) { - return cblas_dasum(n, x, 1); + return cblas_dasum(n, x, 1); } template <> void caffe_gpu_asum(const int n, const float* x, float* y) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(cl_float)), NULL, NULL); - cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(cl_float)), NULL, NULL); - clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, - 0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_y); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); } template <> void caffe_gpu_asum(const int n, const double* x, double* y) { - cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (n * sizeof(cl_double)), NULL, NULL); - cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - (1 * sizeof(cl_double)), NULL, NULL); - clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL); - clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), - y, 0, NULL, NULL); - clReleaseMemObject(scratchBuff); - clReleaseMemObject(d_y); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); 
+ clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); } //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) @@ -743,30 +723,30 @@ INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); template <> void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { - cblas_scopy(n, x, 1, y, 1); - cblas_sscal(n, alpha, y, 1); + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); } template <> void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { - cblas_dcopy(n, x, 1, y, 1); - cblas_dscal(n, alpha, y, 1); + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); } template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { - caffe_gpu_copy(n, x, y); - caffe_gpu_scal(n, alpha, y); + float* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { - caffe_gpu_copy(n, x, y); - caffe_gpu_scal(n, alpha, y); + double* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); } template @@ -775,114 +755,112 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) { template <> void caffe_gpu_set(const int N, const float alpha, float* Y) { - ocl_memset(Y, alpha, N); + ocl_memset(Y, alpha, N); } template <> void caffe_gpu_set(const int N, const double alpha, double* Y) { - ocl_memset(Y, alpha, N); + ocl_memset(Y, alpha, N); } template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - kernel_add_scalar(N, alpha, Y); + kernel_add_scalar(N, alpha, Y); } template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { - kernel_add_scalar(N, alpha, Y); + kernel_add_scalar(N, alpha, Y); 
} template <> void caffe_gpu_exp(const int N, const float* a, float* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } template <> void caffe_gpu_exp(const int N, const double* a, double* y) { - kernel_exp(N, a, y); + kernel_exp(N, a, y); } template <> void caffe_gpu_sign(const int N, const float *X, float *Y) { - caffe_gpu_sign_ocl(N, X, Y); + caffe_gpu_sign_ocl(N, X, Y); } template <> void caffe_gpu_sign(const int N, const double *X, double *Y) { - caffe_gpu_sign_ocl(N, X, Y); + caffe_gpu_sign_ocl(N, X, Y); } template <> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_sub(N, a, b, y); + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); } template <> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_sub(N, a, b, y); + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); } template <> -void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { - kernel_mul(N, a, b, y); +void caffe_gpu_mul(const int N, const float* a, const float* b, + float* y) { + kernel_mul(N, a, b, y); } template <> -void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { - kernel_mul(N, a, b, y); +void caffe_gpu_mul(const int N, const double* a, const double* b, + double* y) { + kernel_mul(N, a, b, y); } template <> -void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { - kernel_div(N, a, b, y); +void caffe_gpu_div(const int N, const float* a, const float* b, + float* y) { + kernel_div(N, a, b, y); } template <> -void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { - kernel_div(N, a, b, y); +void caffe_gpu_div(const int N, const double* a, const double* b, + double* y) { + kernel_div(N, a, b, y); } template <> -void caffe_gpu_powx(const int N, const float* a, - const float 
alpha, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_powx(N, a, alpha, y); +void caffe_gpu_powx(const int N, const float* a, const float alpha, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); } template <> -void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_powx(N, a, alpha, y); +void caffe_gpu_powx(const int N, const double* a, const double alpha, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); } -void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { +void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { } -void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { +void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) { } template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, - const float* y) { - return 0; + const float* y) { + return 0; } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { - return 0; + const double* y) { + return 0; } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { @@ -890,116 +868,116 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) { template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { - caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object + float* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object } template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { - caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object + double* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object } template <> -void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { - caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +void 
caffe_gpu_rng_gaussian(const int n, const float mu, + const float sigma, float* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object } template <> -void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { - caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +void caffe_gpu_rng_gaussian(const int n, const double mu, + const double sigma, double* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object } template <> void caffe_gpu_log(const int N, const float* a, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_log(N, a, y); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); } template <> void caffe_gpu_log(const int N, const double* a, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_log(N, a, y); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); } template <> void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); + vsLn(n, a, y); } template <> void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); + vdLn(n, a, y); } template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); + // NOLINT_NEXT_LINE(caffe/alt_fn) + //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); #else - NO_GPU; + NO_GPU; #endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } } template void caffe_copy(const int N, const int* X, int* Y); template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); + unsigned int* Y); template void caffe_copy(const int N, const float* X, float* Y); template void caffe_copy(const int N, const double* X, double* Y); 
template <> void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); + vsAbs(n, a, y); } template <> void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); + vdAbs(n, a, y); } template <> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_add(N, a, b, y); + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); } template <> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_add(N, a, b, y); + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); } template <> float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); } template <> double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); } template void caffe_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } } template void caffe_set(const int N, const int alpha, int* Y); diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 64245bea..ae71de0f 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -14,150 +14,154 @@ namespace caffe { template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int 
K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK( + cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + // Note that cublas follows fortran order. 
+ int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK( + cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { - CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); + float* Y) { + CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } template <> void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { - CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); + double* Y) { + CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { - if (X != Y) { - CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) - } + if (X != Y) { + CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) + } } template <> void caffe_gpu_scal(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } template <> void caffe_gpu_scal(const int N, const double alpha, double *X) { - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_gpu_dot(const int n, const float* x, const float* y, - 
float* out) { - CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + float* out) { + CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } template <> void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { - CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + double * out) { + CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } template <> void caffe_gpu_asum(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); } template <> void caffe_gpu_asum(const int n, const double* x, double* y) { - CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); } template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { - CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + float* y) { + CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { - CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + double* y) { + CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = alpha; - } + CUDA_KERNEL_LOOP(index, n) { + y[index] = alpha; + } } template void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) - return; - } 
- // NOLINT_NEXT_LINE(whitespace/operators) + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) + return; + } + // NOLINT_NEXT_LINE(whitespace/operators) set_kernel<<>>( - N, alpha, Y); + N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); @@ -167,7 +171,7 @@ template void caffe_gpu_set(const int N, const double alpha, double* Y); template __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; + y[index] += alpha; } } @@ -175,7 +179,7 @@ template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { // NOLINT_NEXT_LINE(whitespace/operators) add_scalar_kernel<<>>( - N, alpha, Y); + N, alpha, Y); } template <> @@ -242,16 +246,16 @@ y[index] = a[index] * b[index]; } template <> -void caffe_gpu_mul(const int N, const float* a, -const float* b, float* y) { +void caffe_gpu_mul(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) mul_kernel<<>>( N, a, b, y); } template <> -void caffe_gpu_mul(const int N, const double* a, -const double* b, double* y) { +void caffe_gpu_mul(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) mul_kernel<<>>( N, a, b, y); @@ -266,16 +270,16 @@ y[index] = a[index] / b[index]; } template <> -void caffe_gpu_div(const int N, const float* a, -const float* b, float* y) { +void caffe_gpu_div(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) div_kernel<<>>( N, a, b, y); } template <> -void caffe_gpu_div(const int N, const double* a, -const double* b, double* y) { +void caffe_gpu_div(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) div_kernel<<>>( N, a, b, y); @@ -353,16 +357,16 @@ y[index] = pow(a[index], alpha); } template <> -void caffe_gpu_powx(const int N, const float* a, -const float 
alpha, float* y) { +void caffe_gpu_powx(const int N, const float* a, const float alpha, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) powx_kernel<<>>( N, a, alpha, y); } template <> -void caffe_gpu_powx(const int N, const double* a, -const double alpha, double* y) { +void caffe_gpu_powx(const int N, const double* a, const double alpha, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) powx_kernel<<>>( N, a, alpha, y); @@ -372,21 +376,21 @@ DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); -__global__ void popc_kernel(const int n, const float* a, -const float* b, uint8_t* y) { +__global__ void popc_kernel(const int n, const float* a, const float* b, +uint8_t* y) { CUDA_KERNEL_LOOP(index, n) { -y[index] = __popc(static_cast(a[index]) ^ -static_cast(b[index])); +y[index] = __popc( +static_cast(a[index]) ^ static_cast(b[index])); } } -__global__ void popcll_kernel(const int n, const double* a, -const double* b, uint8_t* y) { +__global__ void popcll_kernel(const int n, const double* a, const double* b, +uint8_t* y) { CUDA_KERNEL_LOOP(index, n) { -y[index] = __popcll(static_cast(a[index]) ^ -static_cast(b[index])); +y[index] = __popcll( +static_cast(a[index]) ^ static_cast(b[index])); } } @@ -394,24 +398,24 @@ template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, const float* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test - // TestHammingDistanceGPU in test_math_functions.cpp). + // TestHammingDistanceGPU in test_math_functions.cpp). 
NOT_IMPLEMENTED; thrust::device_vector < uint8_t > popcounts(n); - // NOLINT_NEXT_LINE(whitespace/operators) + // NOLINT_NEXT_LINE(whitespace/operators) popc_kernel<<>>( n, x, y, thrust::raw_pointer_cast(popcounts.data())); -return thrust::reduce(popcounts.begin(), popcounts.end(), -(uint32_t) 0, thrust::plus()); +return thrust::reduce(popcounts.begin(), popcounts.end(), (uint32_t) 0, +thrust::plus()); } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, const double* y) { - // TODO: Fix caffe_gpu_hamming_distance (see failing unit test - // TestHammingDistanceGPU in test_math_functions.cpp). + // TODO: Fix caffe_gpu_hamming_distance (see failing unit test + // TestHammingDistanceGPU in test_math_functions.cpp). NOT_IMPLEMENTED; thrust::device_vector < uint8_t > popcounts(n); - // NOLINT_NEXT_LINE(whitespace/operators) + // NOLINT_NEXT_LINE(whitespace/operators) popcll_kernel<<>>( n, x, y, thrust::raw_pointer_cast(popcounts.data())); return thrust::reduce(popcounts.begin(), popcounts.end(), @@ -452,8 +456,7 @@ caffe_gpu_add_scalar(n, a, r); template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, float* r) { -CURAND_CHECK( -curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); +CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } template <> diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 8f44a106..6b5045d8 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -36,56 +36,56 @@ template extern std::string get_dtype_suffix(); template void ocl_memset(Dtype* buffer, const Dtype value, const int count) { - std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int err = 0; - err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); - err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); - err |= clSetKernelArg(Kernel, 2, 
sizeof(cl_int), (void*) &count); - OCL_CHECK(err); + std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err = 0; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ocl_memset(int* buffer, const int value, const int count); template void ocl_memset(float* buffer, const float value, - const int count); + const int count); template void ocl_memset(double* buffer, const double value, - const int count); + const int count); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, - const int count) { - cl_int err; - err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); - err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); - err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); - OCL_CHECK(err); + const int count) { + cl_int err; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[] = { (size_t) count }; + size_t 
Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } void eventCallback(cl_event event, cl_int event_status, void* user_data) { - cl_ulong ev_start_time = (cl_ulong) 0; - cl_ulong ev_end_time = (cl_ulong) 0; - double run_time; - OCL_CHECK( - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, - sizeof(cl_ulong), &ev_start_time, NULL)); - OCL_CHECK( - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), - &ev_end_time, NULL)); - run_time = (double) (ev_end_time - ev_start_time); - printf("The kernel's running time is %f s\n", run_time * 1.0e-9); + cl_ulong ev_start_time = (cl_ulong) 0; + cl_ulong ev_end_time = (cl_ulong) 0; + double run_time; + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &ev_start_time, NULL)); + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), + &ev_end_time, NULL)); + run_time = (double) (ev_end_time - ev_start_time); + printf("The kernel's running time is %f s\n", run_time * 1.0e-9); } } // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 75b69215..5844fb84 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -35,1899 +35,1898 @@ namespace caffe { typedef unsigned int uint32_t; struct array4x32 { - uint32_t v[4]; + uint32_t v[4]; }; template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, - Dtype threshold) { - std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, 
sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds); - ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size); - OCL_CHECK(ret); - - size_t globalws[1] = { size }; - size_t localws[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, - globalws, - localws, 0, NULL, NULL)); + Dtype threshold) { + std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); } template void caffe_gpu_bernoulli(int* a, const unsigned int n, - float inf, float sup, float threshold); + float inf, float sup, float threshold); template void caffe_gpu_bernoulli(int* a, const unsigned int n, - double inf, double sup, double threshold); 
+ double inf, double sup, double threshold); template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, - const int M_, const int packing_num) { - std::string kernel_name = "transform" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; - size_t uiLocal_Work_Size2[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); + const int M_, const int packing_num) { + std::string kernel_name = "transform" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); } template void transform_gpu(float* src, float* dst, const int top_offset, - const int N_, const int M_, const int packing_num); + const int N_, const int M_, const int 
packing_num); template void transform_gpu(double* src, double* dst, - const int top_offset, const int N_, const int M_, const int packing_num); + const int top_offset, const int N_, const int M_, const int packing_num); template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* bottom_data, Dtype* scale_data) { - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data)); + const Dtype* bottom_data, Dtype* scale_data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data)); - size_t Global_Work_Size[1] = { (size_t) num }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, - const float* bottom_data, float* scale_data); + const float* bottom_data, float* scale_data); template void get_max_gpu(cl_kernel Kernel, const int num, - const int dim, const double* bottom_data, double* scale_data); -template -void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) -{ - std::string kernel_name = "RNGUniform" + get_dtype_suffix(); - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 
rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); - OCL_CHECK(ret); - - size_t globalws[1] = {size}; - size_t localws[1] = {256}; - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); -} -template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup); -template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup); - -void caffe_gpu_uniform(const unsigned int n, unsigned int *r) -{ - std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_uint inf = 0; - cl_uint sup = UINT_MAX; - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&r); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*)&inf); - ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*)&sup); - ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); - OCL_CHECK(ret); - - size_t globalws[1] = {size}; - size_t localws[1] = {256}; - OCL_CHECK 
(clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); -} - -template -void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) -{ - std::string kernel_name = "RNGGaussian" + get_dtype_suffix(); - cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - - static unsigned c = 0; - unsigned nrounds = 20; - array4x32 rndctr4; - rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; - cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 - - cl_int ret; - ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); - ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); - ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&E); - ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&V); - ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); - ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); - OCL_CHECK(ret); - - size_t globalws[1] = {size}; - size_t localws[1] = {256}; - OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); -} -template void caffe_gpu_gaussian(float* a, const unsigned int n, float E, float V); -template void caffe_gpu_gaussian(double* a, const unsigned int n, double E, double V); + const int dim, const double* bottom_data, double* scale_data); +template +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) { + std::string kernel_name = "RNGUniform" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) 
&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); +} +template void caffe_gpu_uniform(float* a, const unsigned int n, + float inf, float sup); +template void caffe_gpu_uniform(double* a, const unsigned int n, + double inf, double sup); + +void caffe_gpu_uniform(const unsigned int n, unsigned int *r) { + std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_uint inf = 0; + cl_uint sup = UINT_MAX; + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &r); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); +} + +template +void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) { + std::string kernel_name = "RNGGaussian" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + 
static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &E); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &V); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); +} +template void caffe_gpu_gaussian(float* a, const unsigned int n, float E, + float V); +template void caffe_gpu_gaussian(double* a, const unsigned int n, + double E, double V); template void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) num }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } 
template void exp_gpu(cl_kernel Kernel, const int num, const float* data, - float* out); + float* out); template void exp_gpu(cl_kernel Kernel, const int num, - const double* data, double* out); + const double* data, double* out); template void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* scale, Dtype* data) { - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + const Dtype* scale, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); - size_t Global_Work_Size[1] = { (size_t)(num * dim) }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t)(num * dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void softmax_div_gpu(cl_kernel Kernel, const int num, - const int dim, const float* scale, float* data); + const int dim, const float* scale, float* data); template void softmax_div_gpu(cl_kernel Kernel, const int num, - const int dim, const double* scale, double* data); + const int dim, const double* scale, double* data); template Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, - const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { + const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), 
(void*) &prob_data)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); - size_t globalws[1] = { 256 }; - size_t localws[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, - localws, 0, NULL, NULL)); - void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, - CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); - Dtype loss = *(Dtype*) h_loss; - clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, - NULL); + size_t globalws[1] = { 256 }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, + localws, 0, NULL, NULL)); + void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, + CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + Dtype loss = *(Dtype*) h_loss; + clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, + NULL); - return loss; + return loss; } template float softmax_gpu(cl_kernel Kernel, const int num, - const int dim, const float* prob_data, const float* label, cl_mem d_loss); + const int dim, const float* prob_data, const float* label, cl_mem d_loss); template double softmax_gpu(cl_kernel Kernel, const int num, - const int dim, const double* 
prob_data, const double* label, cl_mem d_loss); + const int dim, const double* prob_data, const double* label, cl_mem d_loss); template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + const int spatial_dim, const Dtype* data, Dtype* out) { + std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const float* data, float* out); + const int spatial_dim, const float* data, float* out); template void kernel_channel_max(const int num, 
const int channels, - const int spatial_dim, const double* data, double* out); + const int spatial_dim, const double* data, double* out); template -void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - std::string kernel_name = "kernel_channel_subtract" - + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); +void kernel_channel_subtract(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data) { + std::string kernel_name = "kernel_channel_subtract" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + 
Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } -template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const float* channel_max, float* data); -template void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const double* channel_max, double* data); +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const float* channel_max, + float* data); +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const double* channel_max, + double* data); template -void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) - { - std::string kernel_name = "kernel_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); +void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_mul(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_mul(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); template void kernel_add_scalar(const int count, const Dtype data, Dtype* out) { - std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_add_scalar(const int count, const float data, - float* out); + float* out); template void kernel_add_scalar(const int count, const double data, - double* out); + double* out); template void kernel_powx(const int count, const Dtype* data, const Dtype alpha, - Dtype* out) { - std::string kernel_name = "kernel_powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + Dtype* out) { + std::string 
kernel_name = "kernel_powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_powx(const int count, const float* data, - const float alpha, float* out); + const float alpha, float* out); template void kernel_powx(const int count, const double* data, - const double alpha, double* out); + const double alpha, double* out); template void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) { - std::string kernel_name = "kernel_div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + 
OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_div(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_div(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); template void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) { - std::string kernel_name = "kernel_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, 
NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_add(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_add(const int count, const double* a, - const double* b, double* out); + const double* b, double* out); template void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) { - std::string kernel_name = "kernel_sub" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_sub" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_sub(const int count, const float* a, const float* b, - float* out); + float* out); template void kernel_sub(const int count, const 
double* a, - const double* b, double* out); + const double* b, double* out); template void kernel_log(const int count, const Dtype* data, Dtype* out) { - std::string kernel_name = "kernel_log" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_log" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_log(const int count, const float* data, float* out); template void kernel_log(const int count, const double* data, - double* out); + double* out); template void kernel_exp(const int count, const Dtype* data, Dtype* out) { - std::string kernel_name = "kernel_exp" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "kernel_exp" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + OCL_CHECK(clSetKernelArg(Kernel, 0, 
sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_exp(const int count, const float* data, float* out); template void kernel_exp(const int count, const double* data, - double* out); + double* out); template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); - size_t 
Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const float* data, float* channel_sum); + const int spatial_dim, const float* data, float* channel_sum); template void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const double* data, double* channel_sum); + const int spatial_dim, const double* data, double* channel_sum); template void kernel_channel_div(const int count, const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + const int spatial_dim, const Dtype* channel_sum, Dtype* data) { + std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + 
OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); - size_t Global_Work_Size[1] = { (size_t) count }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_div(const int count, const int num, - const int channels, - const int spatial_dim, const float* channel_sum, float* data); + const int channels, const int spatial_dim, const float* channel_sum, + float* data); template void kernel_channel_div(const int count, const int num, - const int channels, - const int spatial_dim, const double* channel_sum, double* data); + const int channels, const int spatial_dim, const double* channel_sum, + double* data); template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) { + std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1)); - OCL_CHECK(clSetKernelArg(Kernel, 4, 
sizeof(cl_mem), (void*) &data_2)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot)); - size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const float* data_1, const float* data_2, - float* channel_dot); + const int spatial_dim, const float* data_1, const float* data_2, + float* channel_dot); template void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const double* data_1, const double* data_2, - double* channel_dot); - -template -void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int int_has_ignore_label = has_ignore_label_ ? 
1 : 0; - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); - OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); - OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); - - size_t Global_Work_Size[1] = { (size_t) nthreads }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int spatial_dim, const double* data_1, const double* data_2, + double* channel_dot); + +template +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, + const Dtype* label, Dtype* loss, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int int_has_ignore_label = has_ignore_label_ ? 
1 : 0; + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SoftmaxLossForwardGPU(const int nthreads, - const float* prob_data, const float* label, float* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, float* counts); + const float* prob_data, const float* label, float* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); template void SoftmaxLossForwardGPU(const int nthreads, - const double* prob_data, const double* label, double* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, double* counts); + const double* prob_data, const double* label, double* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); template void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const 
int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - std::string kernel_name = "SoftmaxLossBackwardGPU" - + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - int int_has_ignore_label = has_ignore_label_ ? 1 : 0; - - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff)); - OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); - OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); - OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); - OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); - - size_t Global_Work_Size[1] = { (size_t) nthreads }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossBackwardGPU" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int int_has_ignore_label = has_ignore_label_ ? 
1 : 0; + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SoftmaxLossBackwardGPU(const int nthreads, - const float* top, const float* label, float* bottom_diff, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, float* counts); + const float* top, const float* label, float* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); template void SoftmaxLossBackwardGPU(const int nthreads, - const double* top, const double* label, double* bottom_diff, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, double* counts); + const double* top, const double* label, double* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); template void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { - OCL_CHECK(clSetKernelArg(Kernel, 0, 
sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); - size_t Global_Work_Size[1] = { (size_t) num }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void scal_gpu(cl_kernel Kernel, const int num, - const float alpha, float* data); + const float alpha, float* data); template void scal_gpu(cl_kernel Kernel, const int num, - const double alpha, double* data); + const double alpha, double* data); template void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, - const Dtype* label) { - OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); - OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); - OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); - OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label)); + const Dtype* label) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label)); - size_t Global_Work_Size[1] = { (size_t) num }; - size_t Local_Work_Size[1] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); 
+ size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void diff_gpu(cl_kernel Kernel, const int num, const int dim, - float* data, const float* label); + float* data, const float* label); template void diff_gpu(cl_kernel Kernel, const int num, const int dim, - double* data, const double* label); + double* data, const double* label); template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - Dtype* top_data) { - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int 
pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + float* top_data); template void max_pool_fp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int 
height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + double* top_data); template void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, - Dtype* top_mask) { - std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask); - ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask) { + std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } 
template void MaxPoolForward(const int count, const float* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, float* top_data, int* mask, - float* top_mask); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data, int* mask, + float* top_mask); template void MaxPoolForward(const int count, const double* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, double* top_data, int* mask, - double* top_mask); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data, int* mask, + double* top_mask); template void StoPoolForwardTrain(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* idx_data, Dtype* top_data) { - std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= 
clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= 
clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void StoPoolForwardTrain(const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, float* idx_data, float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* idx_data, float* top_data); template void StoPoolForwardTrain(const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, double* idx_data, - double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* idx_data, + double* top_data); template void 
StoPoolForwardTest(const int count, const Dtype* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - Dtype* top_data) { - std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + 
ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void StoPoolForwardTest(const int count, - const float* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int kernel_w_, - const int stride_h_, const int stride_w_, float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* top_data); template void StoPoolForwardTest(const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_h_, const int 
kernel_w_, - const int stride_h_, const int stride_w_, double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* top_data); template void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, - const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, Dtype* top_data) { - std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 
256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data) { + std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void AvePoolForward(const int count, const float* bottom_data, - const int clnum, const int channels_, const int height_, const 
int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, float* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data); template void AvePoolForward(const int count, const double* bottom_data, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_h_, - const int kernel_w_, const int stride_h_, const int stride_w_, - const int pad_h_, const int pad_w_, double* top_data); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, Dtype* top_data) { - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= 
clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const int clnum, 
const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, float* top_data); + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* top_data); template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const int clnum, const int channels_, - const int height_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, double* top_data); + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* top_data); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, Dtype* bottom_diff) { - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 9, 
sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void 
max_pool_bp_gpu(cl_kernel Kernel, const int count, - const float* bottom_data, const float* top_data, const float* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, float* bottom_diff); + const float* bottom_data, const float* top_data, const float* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, float* bottom_diff); template void max_pool_bp_gpu(cl_kernel Kernel, const int count, - const double* bottom_data, const double* top_data, const double* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, double* bottom_diff); + const double* bottom_data, const double* top_data, const double* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, double* bottom_diff); template void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask); - ret |= clSetKernelArg(Kernel, 3, 
sizeof(cl_mem), (void*) &top_mask); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h); - ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w); - ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask); + ret |= clSetKernelArg(Kernel, 4, 
sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void MaxPoolBackward(const int nthreads, - const float* const top_diff, const int* const mask, - const float* const top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - float* const bottom_diff); + const float* const top_diff, const int* const mask, + const float* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); template void MaxPoolBackward(const int nthreads, - const double* const top_diff, const int* const 
mask, - const double* const top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - double* const bottom_diff); + const double* const top_diff, const int* const mask, + const double* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); template void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) 
&stride_w); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w); - ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] 
= { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void AvePoolBackward(const int nthreads, - const float* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - float* const bottom_diff); + const float* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); template void AvePoolBackward(const int nthreads, - const double* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - double* const bottom_diff); + const double* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); template void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, - const Dtype* const top_diff, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, Dtype* const bottom_diff) { - std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff) { + std::string kernel_name = "StoPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels); + ret 
|= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void StoPoolBackward(const int nthreads, - const float* const rand_idx, const float* const top_diff, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, - float* const bottom_diff); + const float* const rand_idx, const float* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + float* const bottom_diff); template void StoPoolBackward(const int nthreads, - const double* const rand_idx, const double* const top_diff, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, - double* const bottom_diff); + const double* const rand_idx, const double* const top_diff, const int num, + const int channels, const int height, 
const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + double* const bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, - const int clnum, const int channels_, const int height_, const int width_, - const int pooled_height_, const int pooled_width_, const int kernel_size_, - const int stride_, const int pad_, Dtype* bottom_diff) { - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const int pad_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 
2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, - const float* top_diff, const int clnum, const int channels_, - const int intheight_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, float* bottom_diff); + const float* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* bottom_diff); template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, - const double* top_diff, const int clnum, const int channels_, - const int intheight_, const int width_, const int pooled_height_, - const int pooled_width_, const int kernel_size_, const int stride_, - const int pad_, double* bottom_diff); + const double* top_diff, const int clnum, const int channels_, + const int intheight_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + 
const int pad_, double* bottom_diff); template void PReLUForward(const int count, const int channels, const int dim, - const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, - const int div_factor) { - std::string kernel_name = "PReLUForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor) { + std::string kernel_name = "PReLUForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, 
NULL, NULL)); } template void PReLUForward(const int count, const int channels, - const int dim, const float* bottom_data, float* top_data, - const float* slope_data, const int div_factor); + const int dim, const float* bottom_data, float* top_data, + const float* slope_data, const int div_factor); template void PReLUForward(const int count, const int channels, - const int dim, const double* bottom_data, double* top_data, - const double* slope_data, const int div_factor); + const int dim, const double* bottom_data, double* top_data, + const double* slope_data, const int div_factor); template void PReLUBackward(const int count, const int channels, const int dim, - const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, - const Dtype* slope_data, const int div_factor) { - std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor) { + std::string kernel_name = "PReLUBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void PReLUBackward(const int count, const int channels, - const int dim, const float* top_diff, const float* bottom_data, - float* bottom_diff, const float* slope_data, const int div_factor); + const int dim, const float* top_diff, const float* bottom_data, + float* bottom_diff, const float* slope_data, const int div_factor); template void PReLUBackward(const int count, const int channels, - const int dim, const double* top_diff, const double* bottom_data, - double* bottom_diff, const double* slope_data, const int div_factor); + const int dim, const double* top_diff, const double* bottom_data, + double* bottom_diff, const double* slope_data, const int div_factor); template void PReLUParamBackward(const int count, const Dtype* top_diff, - const int offset_out, const Dtype* bottom_data, const int offset_in, - Dtype* bottom_diff) { - std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out); - ret |= 
clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); - ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff) { + std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void PReLUParamBackward(const int count, const float* top_diff, - const int offset_out, const float* bottom_data, const int offset_in, - float* bottom_diff); + const int offset_out, const float* bottom_data, const int offset_in, + float* bottom_diff); template void PReLUParamBackward(const int count, - const double* top_diff, const int offset_out, const double* bottom_data, - const int offset_in, double* bottom_diff); + const double* top_diff, const int offset_out, const double* bottom_data, + const int offset_in, double* bottom_diff); template void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, - Dtype negative_slope) { - std::string 
kernel_name = "ReLUForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + Dtype negative_slope) { + std::string kernel_name = "ReLUForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ReLUForward(const int count, const float* bottom_data, - float* top_data, float negative_slope); + float* top_data, float negative_slope); template void ReLUForward(const int count, const double* bottom_data, - double* top_data, double negative_slope); + double* top_data, double negative_slope); template void ReLUBackward(const int count, const Dtype* top_diff, - const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { - std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= 
clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); - ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { + std::string kernel_name = "ReLUBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void ReLUBackward(const int count, const float* top_diff, - const float* bottom_data, float* bottom_diff, float negative_slope); + const float* bottom_data, float* bottom_diff, float negative_slope); template void ReLUBackward(const int count, const double* top_diff, - const double* bottom_data, double* bottom_diff, double negative_slope); + const double* bottom_data, double* bottom_diff, double negative_slope); template void SigmoidForward(const int count, const Dtype* bottom_data, - Dtype* top_data) { - std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - 
cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + Dtype* top_data) { + std::string kernel_name = "SigmoidForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void SigmoidForward(const int count, const float* bottom_data, - float* top_data); + float* top_data); template void SigmoidForward(const int count, const double* bottom_data, - double* top_data); + double* top_data); template void SigmoidBackward(const int count, const Dtype* top_diff, - const Dtype* top_data, Dtype* bottom_diff) { - std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const Dtype* top_data, Dtype* bottom_diff) { + std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void SigmoidBackward(const int count, const float* top_diff, - const float* top_data, float* bottom_diff); + const float* top_data, float* bottom_diff); template void SigmoidBackward(const int count, const double* top_diff, - const double* top_data, double* bottom_diff); + const double* top_data, double* bottom_diff); template void ThresholdForward(const int count, const Dtype threshold, - const Dtype* bottom_data, Dtype* top_data) { - std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, Dtype* top_data) { + 
std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void ThresholdForward(const int count, const float threshold, - const float* bottom_data, float* top_data); + const float* bottom_data, float* top_data); template void ThresholdForward(const int count, const double threshold, - const double* bottom_data, double* top_data); + const double* bottom_data, double* top_data); template void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { - std::string kernel_name = "TanHForward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "TanHForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) 
&top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void TanHForward(const int count, const float* bottom_data, - float* top_data); + float* top_data); template void TanHForward(const int count, const double* bottom_data, - double* top_data); + double* top_data); template void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, - Dtype* bottom_diff) { - std::string kernel_name = "TanHBackward" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) count }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + Dtype* bottom_diff) { + std::string kernel_name = "TanHBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void TanHBackward(const int count, const 
float* top_diff, - const float* top_data, float* bottom_diff); + const float* top_data, float* bottom_diff); template void TanHBackward(const int count, const double* top_diff, - const double* top_data, double* bottom_diff); + const double* top_data, double* bottom_diff); template void opttrans(const Dtype* data_im, const int im_offset, const int channels, - const int height, const int width, Dtype* data_opt, const int opt_offset, - const int optnum) { - std::string kernel_name = "opttrans" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int num_kernels = channels * height * width * optnum; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); - OCL_CHECK(ret); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= 
clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void opttrans(const float* data_im, const int im_offset, - const int channels, - const int height, const int width, float* data_opt, const int opt_offset, - const int optnum); + const int channels, const int height, const int width, float* data_opt, + const int opt_offset, const int optnum); template void opttrans(const double* data_im, const int im_offset, - const int channels, - const int height, const int width, double* data_opt, const int opt_offset, - const int optnum); - -template -void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); - cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); - ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(LFSkernel, 5, 
sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); - ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); - ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); - ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + const int channels, const int height, const int width, double* data_opt, + const int opt_offset, const int optnum); + +template +void LRNFillScale(const int nthreads, const Dtype* const in, const int num, + const int channels, const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, Dtype* const scale) { + std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); + cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); + ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); + ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, + uiGlobal_Work_Size, 
uiLocal_Work_Size, 0, NULL, NULL)); } template void LRNFillScale(const int nthreads, const float* const in, - const int num, const int channels, const int height, - const int width, const int size, const float alpha_over_size, - const float k, float* const scale); + const int num, const int channels, const int height, const int width, + const int size, const float alpha_over_size, const float k, + float* const scale); template void LRNFillScale(const int nthreads, const double* const in, - const int num, const int channels, const int height, - const int width, const int size, const double alpha_over_size, - const double k, double* const scale); - -template -void LRNComputeOutput(int nthreads, const Dtype* in, - Dtype* scale, Dtype negative_beta, Dtype* out) { - std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); - cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); - ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); - ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); - ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size2[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, - uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); + const int num, const int channels, const int height, const int width, + const int size, const double alpha_over_size, const double k, + double* const scale); + +template +void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale, + Dtype negative_beta, Dtype* out) { + std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); + cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), 
(void*) &nthreads); + ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); } template void LRNComputeOutput(int nthreads, const float* in, - float* scale, float negative_beta, float* out); + float* scale, float negative_beta, float* out); template void LRNComputeOutput(int nthreads, const double* in, - double* scale, double negative_beta, double* out); - -template -void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); - cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); - ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); - ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); - ret |= 
clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); - ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); - ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); - ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + double* scale, double negative_beta, double* out); + +template +void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data, + const Dtype* const top_data, const Dtype* const scale, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int size, + const Dtype negative_beta, const Dtype cache_ratio, + Dtype* const bottom_diff) { + std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); + cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); + ret |= 
clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); } template void LRNComputeDiff(const int nthreads, - const float* const bottom_data, const float* const top_data, - const float* const scale, const float* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const float negative_beta, - const float cache_ratio, float* const bottom_diff); + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const float negative_beta, const float cache_ratio, + float* const bottom_diff); template void LRNComputeDiff(const int nthreads, - const double* const bottom_data, const double* const top_data, - const double* const scale, const double* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const double negative_beta, - const double cache_ratio, double* const bottom_diff); + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const double negative_beta, const double cache_ratio, + double* const bottom_diff); template void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { - std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); - ret |= clSetKernelArg(Kernel, 2, 
sizeof(cl_mem), (void*) &in2); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_add(const int n, const float* in1, - const float* in2, float* y); + const float* in2, float* y); template void caffe_gpu_add(const int n, const double* in1, - const double* in2, double* y); + const double* in2, double* y); template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { - std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) N }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = 
clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); template void caffe_gpu_sign_ocl(const int N, const double* X, - double* Y); + double* Y); template void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { - std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) N }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); template void caffe_gpu_abs_ocl(const int N, const double* X, - double* Y); + double* Y); template void caffe_gpu_div(const int n, const Dtype* 
a, const Dtype* b, Dtype* y) { - std::string kernel_name = "div" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_div(const int n, const float* a, const float* b, - float* y); + float* y); template void caffe_gpu_div(const int n, const double* a, - const double* b, double* y); + const double* b, double* y); template void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { - std::string kernel_name = "add_scalar" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 
}; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_add_scalar(const int n, const float alpha, - float* top_data); + float* top_data); template void caffe_gpu_add_scalar(const int n, const double alpha, - double* top_data); + double* top_data); template void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) { - std::string kernel_name = "element_mul" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, 
sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_mul(const int n, const float* a, const float* b, - float* y); + float* y); template void caffe_gpu_mul(const int n, const double* a, - const double* b, double* y); + const double* b, double* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { - std::string kernel_name = "powx" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); - ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); - OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) n }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void caffe_gpu_powx(const int n, const float* a, - const float alpha, float* y); + const float alpha, float* y); template void caffe_gpu_powx(const int n, const double* a, - 
const double alpha, double* y); + const double alpha, double* y); template void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype* top_data) { - std::string kernel_name = "DropoutForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); + const int* MaskMem, const Dtype scale_, Dtype* top_data) { + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void DropoutForward(const int count, const float* bottom_data, - const int* MaskMem, const float scale_, float* top_data); + const int* MaskMem, const float scale_, float* top_data); template void DropoutForward(const int count, const double* bottom_data, - const int* MaskMem, 
const double scale_, double* top_data); + const int* MaskMem, const double scale_, double* top_data); template void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff) { - std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); - ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const float threshold_, const Dtype scale_, Dtype* bottom_diff) { + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void DropoutBackward(const int count, const float* 
top_diff, - const int* MaskMem, const float threshold_, const float scale_, - float* bottom_diff); + const int* MaskMem, const float threshold_, const float scale_, + float* bottom_diff); template void DropoutBackward(const int count, const double* top_diff, - const int* MaskMem, const float threshold_, const double scale_, - double* bottom_diff); + const int* MaskMem, const float threshold_, const double scale_, + double* bottom_diff); template void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) { - std::string kernel_name = "BNLLForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); - OCL_CHECK(ret); + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void BNLLForward(const int count, const float* bottom_data, - float *top_data); + float *top_data); template void BNLLForward(const int count, const double* bottom_data, - double *top_data); + double *top_data); template void BNLLBackward(const int count, const Dtype* top_diff, - const 
Dtype* bottom_data, Dtype *bottom_diff) { - std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data, Dtype *bottom_diff) { + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void BNLLBackward(const int count, const float* top_diff, - const float* bottom_data, float *bottom_diff); + const float* bottom_data, float *bottom_diff); template void BNLLBackward(const int count, const double* top_diff, - const double* bottom_data, double *bottom_diff); + const double* bottom_data, double *bottom_diff); template void Concat(const int nthreads, const Dtype* in_data, const bool forward, - const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype *out_data) { - 
std::string kernel_name = "Concat" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - int k_forward = (forward == true) ? 1 : 0; - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) nthreads }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const int num_concats, const int concat_size, const int top_concat_axis, + const int bottom_concat_axis, const int offset_concat_axis, + Dtype *out_data) { + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == true) ? 
1 : 0; + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void Concat(const int nthreads, const float* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, float *out_data); + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, float *out_data); template void Concat(const int nthreads, const double* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, double *out_data); - -template -void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - std::string kernel_name = "CLLBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), 
(void*) &count); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version); - ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y); - ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff); - ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq); - ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) count }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const bool forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, double *out_data); + +template +void CLLBackward(const int count, const int channels, const Dtype margin, + const bool legacy_version, const Dtype alpha, const Dtype* y, + const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff) { + std::string kernel_name = "CLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version); + ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + 
size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void CLLBackward(const int count, const int channels, - const float margin, const bool legacy_version, const float alpha, - const float* y, const float* diff, const float* dist_sq, - float *bottom_diff); + const float margin, const bool legacy_version, const float alpha, + const float* y, const float* diff, const float* dist_sq, + float *bottom_diff); template void CLLBackward(const int count, const int channels, - const double margin, const bool legacy_version, const double alpha, - const double* y, const double* diff, const double* dist_sq, - double *bottom_diff); + const double margin, const bool legacy_version, const double alpha, + const double* y, const double* diff, const double* dist_sq, + double *bottom_diff); template void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - std::string kernel_name = "MaxForward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); - ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) nthreads }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, + int* 
mask) { + std::string kernel_name = "MaxForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxForward(const int nthreads, const float* bottom_data_a, - const float* bottom_data_b, const int blob_idx, float* top_data, - int* mask); + const float* bottom_data_b, const int blob_idx, float* top_data, int* mask); template void MaxForward(const int nthreads, - const double* bottom_data_a, - const double* bottom_data_b, const int blob_idx, double* top_data, - int* mask); - -template -void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - std::string kernel_name = "MaxBackward" + get_dtype_suffix(); - cl_kernel kernel = amdDevice.GetKernel(kernel_name); - - cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); - ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); - ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff); - OCL_CHECK(ret); - - size_t Global_Work_Size[] = { (size_t) nthreads }; - size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - 
clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, - Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + const double* bottom_data_a, const double* bottom_data_b, + const int blob_idx, double* top_data, int* mask); + +template +void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx, + const int* mask, Dtype* bottom_diff) { + std::string kernel_name = "MaxBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void MaxBackward(const int nthreads, const float* top_diff, - const int blob_idx, const int* mask, float* bottom_diff); + const int blob_idx, const int* mask, float* bottom_diff); template void MaxBackward(const int nthreads, const double* top_diff, - const int blob_idx, const int* mask, double* bottom_diff); + const int blob_idx, const int* mask, double* bottom_diff); template void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, - int channel_in, int width, int height, int channel_out, int width_out, - int height_out, int kernel_w, int kernel_h, int stride, int pad, - int batch_sz) { + int channel_in, int width, int height, int channel_out, int width_out, + int height_out, int kernel_w, int kernel_h, int stride, int pad, + int batch_sz) { } template void ocl_conv(float* bottom_data, float* top_data, - float* weights, float* bias, int channel_in, int 
width, int height, - int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, - int stride, int pad, int batch_sz); + float* weights, float* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); template void ocl_conv(double* bottom_data, double* top_data, - double* weights, double* bias, int channel_in, int width, int height, - int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, - int stride, int pad, int batch_sz); + double* weights, double* bias, int channel_in, int width, int height, + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); } // namespace caffe diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index da533cd9..028dd884 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -13,564 +13,562 @@ namespace caffe { bool NetNeedsUpgrade(const NetParameter& net_param) { - return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param); + return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param); } bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param) { - for (int i = 0; i < net_param.layers_size(); ++i) { - if (net_param.layers(i).has_layer()) { - return true; - } - } - return false; + for (int i = 0; i < net_param.layers_size(); ++i) { + if (net_param.layers(i).has_layer()) { + return true; + } + } + return false; } bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) { - return net_param.layers_size() > 0; + return net_param.layers_size() > 0; } bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers, - NetParameter* net_param) { - // First upgrade padding layers to padded conv layers. - NetParameter v0_net_param; - UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param); - // Now upgrade layer parameters. 
- bool is_fully_compatible = true; - net_param->Clear(); - if (v0_net_param.has_name()) { - net_param->set_name(v0_net_param.name()); - } - for (int i = 0; i < v0_net_param.layers_size(); ++i) { - is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i), - net_param->add_layers()); - } - for (int i = 0; i < v0_net_param.input_size(); ++i) { - net_param->add_input(v0_net_param.input(i)); - } - for (int i = 0; i < v0_net_param.input_dim_size(); ++i) { - net_param->add_input_dim(v0_net_param.input_dim(i)); - } - if (v0_net_param.has_force_backward()) { - net_param->set_force_backward(v0_net_param.force_backward()); - } - return is_fully_compatible; + NetParameter* net_param) { + // First upgrade padding layers to padded conv layers. + NetParameter v0_net_param; + UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param); + // Now upgrade layer parameters. + bool is_fully_compatible = true; + net_param->Clear(); + if (v0_net_param.has_name()) { + net_param->set_name(v0_net_param.name()); + } + for (int i = 0; i < v0_net_param.layers_size(); ++i) { + is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i), + net_param->add_layers()); + } + for (int i = 0; i < v0_net_param.input_size(); ++i) { + net_param->add_input(v0_net_param.input(i)); + } + for (int i = 0; i < v0_net_param.input_dim_size(); ++i) { + net_param->add_input_dim(v0_net_param.input_dim(i)); + } + if (v0_net_param.has_force_backward()) { + net_param->set_force_backward(v0_net_param.force_backward()); + } + return is_fully_compatible; } void UpgradeV0PaddingLayers(const NetParameter& param, - NetParameter* param_upgraded_pad) { - // Copy everything other than the layers from the original param. - param_upgraded_pad->Clear(); - param_upgraded_pad->CopyFrom(param); - param_upgraded_pad->clear_layers(); - // Figure out which layer each bottom blob comes from. 
- map blob_name_to_last_top_idx; - for (int i = 0; i < param.input_size(); ++i) { - const string& blob_name = param.input(i); - blob_name_to_last_top_idx[blob_name] = -1; - } - for (int i = 0; i < param.layers_size(); ++i) { - const V1LayerParameter& layer_connection = param.layers(i); - const V0LayerParameter& layer_param = layer_connection.layer(); - // Add the layer to the new net, unless it's a padding layer. - if (layer_param.type() != "padding") { - param_upgraded_pad->add_layers()->CopyFrom(layer_connection); - } - for (int j = 0; j < layer_connection.bottom_size(); ++j) { - const string& blob_name = layer_connection.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { - LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; - } - const int top_idx = blob_name_to_last_top_idx[blob_name]; - if (top_idx == -1) { - continue; - } - const V1LayerParameter& source_layer = param.layers(top_idx); - if (source_layer.layer().type() == "padding") { - // This layer has a padding layer as input -- check that it is a conv - // layer or a pooling layer and takes only one input. Also check that - // the padding layer input has only one input and one output. Other - // cases have undefined behavior in Caffe. 
- CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool")) - << "Padding layer input to " - "non-convolutional / non-pooling layer type " - << layer_param.type(); - CHECK_EQ(layer_connection.bottom_size(), 1) - << "Conv Layer takes a single blob as input."; - CHECK_EQ(source_layer.bottom_size(), 1) - << "Padding Layer takes a single blob as input."; - CHECK_EQ(source_layer.top_size(), 1) - << "Padding Layer produces a single blob as output."; - int layer_index = param_upgraded_pad->layers_size() - 1; - param_upgraded_pad->mutable_layers(layer_index)->mutable_layer() - ->set_pad(source_layer.layer().pad()); - param_upgraded_pad->mutable_layers(layer_index) - ->set_bottom(j, source_layer.bottom(0)); - } - } - for (int j = 0; j < layer_connection.top_size(); ++j) { - const string& blob_name = layer_connection.top(j); - blob_name_to_last_top_idx[blob_name] = i; - } - } + NetParameter* param_upgraded_pad) { + // Copy everything other than the layers from the original param. + param_upgraded_pad->Clear(); + param_upgraded_pad->CopyFrom(param); + param_upgraded_pad->clear_layers(); + // Figure out which layer each bottom blob comes from. + map blob_name_to_last_top_idx; + for (int i = 0; i < param.input_size(); ++i) { + const string& blob_name = param.input(i); + blob_name_to_last_top_idx[blob_name] = -1; + } + for (int i = 0; i < param.layers_size(); ++i) { + const V1LayerParameter& layer_connection = param.layers(i); + const V0LayerParameter& layer_param = layer_connection.layer(); + // Add the layer to the new net, unless it's a padding layer. 
+ if (layer_param.type() != "padding") { + param_upgraded_pad->add_layers()->CopyFrom(layer_connection); + } + for (int j = 0; j < layer_connection.bottom_size(); ++j) { + const string& blob_name = layer_connection.bottom(j); + if (blob_name_to_last_top_idx.find(blob_name) + == blob_name_to_last_top_idx.end()) { + LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; + } + const int top_idx = blob_name_to_last_top_idx[blob_name]; + if (top_idx == -1) { + continue; + } + const V1LayerParameter& source_layer = param.layers(top_idx); + if (source_layer.layer().type() == "padding") { + // This layer has a padding layer as input -- check that it is a conv + // layer or a pooling layer and takes only one input. Also check that + // the padding layer input has only one input and one output. Other + // cases have undefined behavior in Caffe. + CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool")) + << "Padding layer input to " + "non-convolutional / non-pooling layer type " + << layer_param.type(); + CHECK_EQ(layer_connection.bottom_size(), 1) + << "Conv Layer takes a single blob as input."; + CHECK_EQ(source_layer.bottom_size(), 1) + << "Padding Layer takes a single blob as input."; + CHECK_EQ(source_layer.top_size(), 1) + << "Padding Layer produces a single blob as output."; + int layer_index = param_upgraded_pad->layers_size() - 1; + param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()->set_pad( + source_layer.layer().pad()); + param_upgraded_pad->mutable_layers(layer_index)->set_bottom(j, + source_layer.bottom(0)); + } + } + for (int j = 0; j < layer_connection.top_size(); ++j) { + const string& blob_name = layer_connection.top(j); + blob_name_to_last_top_idx[blob_name] = i; + } + } } bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, - V1LayerParameter* layer_param) { - bool is_fully_compatible = true; - layer_param->Clear(); - for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) { - 
layer_param->add_bottom(v0_layer_connection.bottom(i)); - } - for (int i = 0; i < v0_layer_connection.top_size(); ++i) { - layer_param->add_top(v0_layer_connection.top(i)); - } - if (v0_layer_connection.has_layer()) { - const V0LayerParameter& v0_layer_param = v0_layer_connection.layer(); - if (v0_layer_param.has_name()) { - layer_param->set_name(v0_layer_param.name()); - } - const string& type = v0_layer_param.type(); - if (v0_layer_param.has_type()) { - layer_param->set_type(UpgradeV0LayerType(type)); - } - for (int i = 0; i < v0_layer_param.blobs_size(); ++i) { - layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i)); - } - for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) { - layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i)); - } - for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) { - layer_param->add_weight_decay(v0_layer_param.weight_decay(i)); - } - if (v0_layer_param.has_num_output()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_num_output( - v0_layer_param.num_output()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()->set_num_output( - v0_layer_param.num_output()); - } else { - LOG(ERROR) << "Unknown parameter num_output for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_biasterm()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_bias_term( - v0_layer_param.biasterm()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()->set_bias_term( - v0_layer_param.biasterm()); - } else { - LOG(ERROR) << "Unknown parameter biasterm for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_weight_filler()) { - if (type == "conv") { - layer_param->mutable_convolution_param()-> - mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()-> - 
mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler()); - } else { - LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_bias_filler()) { - if (type == "conv") { - layer_param->mutable_convolution_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); - } else if (type == "innerproduct") { - layer_param->mutable_inner_product_param()-> - mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler()); - } else { - LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_pad()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad()); - } else if (type == "pool") { - layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad()); - } else { - LOG(ERROR) << "Unknown parameter pad for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_kernelsize()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_kernel_size( - v0_layer_param.kernelsize()); - } else if (type == "pool") { - layer_param->mutable_pooling_param()->set_kernel_size( - v0_layer_param.kernelsize()); - } else { - LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_group()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_group( - v0_layer_param.group()); - } else { - LOG(ERROR) << "Unknown parameter group for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_stride()) { - if (type == "conv") { - layer_param->mutable_convolution_param()->set_stride( - v0_layer_param.stride()); - } else if (type == "pool") { - layer_param->mutable_pooling_param()->set_stride( - v0_layer_param.stride()); - } else { - LOG(ERROR) << "Unknown parameter stride for layer type " << type; - 
is_fully_compatible = false; - } - } - if (v0_layer_param.has_pool()) { - if (type == "pool") { - V0LayerParameter_PoolMethod pool = v0_layer_param.pool(); - switch (pool) { - case V0LayerParameter_PoolMethod_MAX: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_MAX); - break; - case V0LayerParameter_PoolMethod_AVE: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_AVE); - break; - case V0LayerParameter_PoolMethod_STOCHASTIC: - layer_param->mutable_pooling_param()->set_pool( - PoolingParameter_PoolMethod_STOCHASTIC); - break; - default: - LOG(ERROR) << "Unknown pool method " << pool; - is_fully_compatible = false; - } - } else { - LOG(ERROR) << "Unknown parameter pool for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_dropout_ratio()) { - if (type == "dropout") { - layer_param->mutable_dropout_param()->set_dropout_ratio( - v0_layer_param.dropout_ratio()); - } else { - LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_local_size()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_local_size( - v0_layer_param.local_size()); - } else { - LOG(ERROR) << "Unknown parameter local_size for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_alpha()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha()); - } else { - LOG(ERROR) << "Unknown parameter alpha for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_beta()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta()); - } else { - LOG(ERROR) << "Unknown parameter beta for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_k()) { - if (type == "lrn") { - layer_param->mutable_lrn_param()->set_k(v0_layer_param.k()); - } else { - LOG(ERROR) << 
"Unknown parameter k for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_source()) { - if (type == "data") { - layer_param->mutable_data_param()->set_source(v0_layer_param.source()); - } else if (type == "hdf5_data") { - layer_param->mutable_hdf5_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "window_data") { - layer_param->mutable_window_data_param()->set_source( - v0_layer_param.source()); - } else if (type == "infogain_loss") { - layer_param->mutable_infogain_loss_param()->set_source( - v0_layer_param.source()); - } else { - LOG(ERROR) << "Unknown parameter source for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_scale()) { - layer_param->mutable_transform_param()-> - set_scale(v0_layer_param.scale()); - } - if (v0_layer_param.has_meanfile()) { - layer_param->mutable_transform_param()-> - set_mean_file(v0_layer_param.meanfile()); - } - if (v0_layer_param.has_batchsize()) { - if (type == "data") { - layer_param->mutable_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "hdf5_data") { - layer_param->mutable_hdf5_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else if (type == "window_data") { - layer_param->mutable_window_data_param()->set_batch_size( - v0_layer_param.batchsize()); - } else { - LOG(ERROR) << "Unknown parameter batchsize for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_cropsize()) { - layer_param->mutable_transform_param()-> - set_crop_size(v0_layer_param.cropsize()); - } - if (v0_layer_param.has_mirror()) { - layer_param->mutable_transform_param()-> - set_mirror(v0_layer_param.mirror()); - } - if 
(v0_layer_param.has_rand_skip()) { - if (type == "data") { - layer_param->mutable_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); - } else if (type == "images") { - layer_param->mutable_image_data_param()->set_rand_skip( - v0_layer_param.rand_skip()); - } else { - LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_shuffle_images()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_shuffle( - v0_layer_param.shuffle_images()); - } else { - LOG(ERROR) << "Unknown parameter shuffle for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_new_height()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_new_height( - v0_layer_param.new_height()); - } else { - LOG(ERROR) << "Unknown parameter new_height for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_new_width()) { - if (type == "images") { - layer_param->mutable_image_data_param()->set_new_width( - v0_layer_param.new_width()); - } else { - LOG(ERROR) << "Unknown parameter new_width for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_concat_dim()) { - if (type == "concat") { - layer_param->mutable_concat_param()->set_concat_dim( - v0_layer_param.concat_dim()); - } else { - LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_fg_threshold()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_fg_threshold( - v0_layer_param.det_fg_threshold()); - } else { - LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_bg_threshold()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_bg_threshold( - v0_layer_param.det_bg_threshold()); - } else { - 
LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_fg_fraction()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_fg_fraction( - v0_layer_param.det_fg_fraction()); - } else { - LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_context_pad()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_context_pad( - v0_layer_param.det_context_pad()); - } else { - LOG(ERROR) << "Unknown parameter det_context_pad for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_det_crop_mode()) { - if (type == "window_data") { - layer_param->mutable_window_data_param()->set_crop_mode( - v0_layer_param.det_crop_mode()); - } else { - LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " - << type; - is_fully_compatible = false; - } - } - if (v0_layer_param.has_hdf5_output_param()) { - if (type == "hdf5_output") { - layer_param->mutable_hdf5_output_param()->CopyFrom( - v0_layer_param.hdf5_output_param()); - } else { - LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type " - << type; - is_fully_compatible = false; - } - } - } - return is_fully_compatible; + V1LayerParameter* layer_param) { + bool is_fully_compatible = true; + layer_param->Clear(); + for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) { + layer_param->add_bottom(v0_layer_connection.bottom(i)); + } + for (int i = 0; i < v0_layer_connection.top_size(); ++i) { + layer_param->add_top(v0_layer_connection.top(i)); + } + if (v0_layer_connection.has_layer()) { + const V0LayerParameter& v0_layer_param = v0_layer_connection.layer(); + if (v0_layer_param.has_name()) { + layer_param->set_name(v0_layer_param.name()); + } + const string& type = v0_layer_param.type(); + if (v0_layer_param.has_type()) { + 
layer_param->set_type(UpgradeV0LayerType(type)); + } + for (int i = 0; i < v0_layer_param.blobs_size(); ++i) { + layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i)); + } + for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) { + layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i)); + } + for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) { + layer_param->add_weight_decay(v0_layer_param.weight_decay(i)); + } + if (v0_layer_param.has_num_output()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_num_output( + v0_layer_param.num_output()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()->set_num_output( + v0_layer_param.num_output()); + } else { + LOG(ERROR) << "Unknown parameter num_output for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_biasterm()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_bias_term( + v0_layer_param.biasterm()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()->set_bias_term( + v0_layer_param.biasterm()); + } else { + LOG(ERROR) << "Unknown parameter biasterm for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_weight_filler()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->mutable_weight_filler()->CopyFrom( + v0_layer_param.weight_filler()); + } else if (type == "innerproduct") { + layer_param->mutable_inner_product_param()->mutable_weight_filler()->CopyFrom( + v0_layer_param.weight_filler()); + } else { + LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_bias_filler()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->mutable_bias_filler()->CopyFrom( + v0_layer_param.bias_filler()); + } else if (type == "innerproduct") { + 
layer_param->mutable_inner_product_param()->mutable_bias_filler()->CopyFrom( + v0_layer_param.bias_filler()); + } else { + LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_pad()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad()); + } else { + LOG(ERROR) << "Unknown parameter pad for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_kernelsize()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_kernel_size( + v0_layer_param.kernelsize()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_kernel_size( + v0_layer_param.kernelsize()); + } else { + LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_group()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_group( + v0_layer_param.group()); + } else { + LOG(ERROR) << "Unknown parameter group for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_stride()) { + if (type == "conv") { + layer_param->mutable_convolution_param()->set_stride( + v0_layer_param.stride()); + } else if (type == "pool") { + layer_param->mutable_pooling_param()->set_stride( + v0_layer_param.stride()); + } else { + LOG(ERROR) << "Unknown parameter stride for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_pool()) { + if (type == "pool") { + V0LayerParameter_PoolMethod pool = v0_layer_param.pool(); + switch (pool) { + case V0LayerParameter_PoolMethod_MAX: + layer_param->mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_MAX); + break; + case V0LayerParameter_PoolMethod_AVE: + layer_param->mutable_pooling_param()->set_pool( + 
PoolingParameter_PoolMethod_AVE); + break; + case V0LayerParameter_PoolMethod_STOCHASTIC: + layer_param->mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_STOCHASTIC); + break; + default: + LOG(ERROR) << "Unknown pool method " << pool; + is_fully_compatible = false; + } + } else { + LOG(ERROR) << "Unknown parameter pool for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_dropout_ratio()) { + if (type == "dropout") { + layer_param->mutable_dropout_param()->set_dropout_ratio( + v0_layer_param.dropout_ratio()); + } else { + LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_local_size()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_local_size( + v0_layer_param.local_size()); + } else { + LOG(ERROR) << "Unknown parameter local_size for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_alpha()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha()); + } else { + LOG(ERROR) << "Unknown parameter alpha for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_beta()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta()); + } else { + LOG(ERROR) << "Unknown parameter beta for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_k()) { + if (type == "lrn") { + layer_param->mutable_lrn_param()->set_k(v0_layer_param.k()); + } else { + LOG(ERROR) << "Unknown parameter k for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_source()) { + if (type == "data") { + layer_param->mutable_data_param()->set_source(v0_layer_param.source()); + } else if (type == "hdf5_data") { + layer_param->mutable_hdf5_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "images") { + 
layer_param->mutable_image_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "window_data") { + layer_param->mutable_window_data_param()->set_source( + v0_layer_param.source()); + } else if (type == "infogain_loss") { + layer_param->mutable_infogain_loss_param()->set_source( + v0_layer_param.source()); + } else { + LOG(ERROR) << "Unknown parameter source for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_scale()) { + layer_param->mutable_transform_param()->set_scale(v0_layer_param.scale()); + } + if (v0_layer_param.has_meanfile()) { + layer_param->mutable_transform_param()->set_mean_file( + v0_layer_param.meanfile()); + } + if (v0_layer_param.has_batchsize()) { + if (type == "data") { + layer_param->mutable_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "hdf5_data") { + layer_param->mutable_hdf5_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "images") { + layer_param->mutable_image_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else if (type == "window_data") { + layer_param->mutable_window_data_param()->set_batch_size( + v0_layer_param.batchsize()); + } else { + LOG(ERROR) << "Unknown parameter batchsize for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_cropsize()) { + layer_param->mutable_transform_param()->set_crop_size( + v0_layer_param.cropsize()); + } + if (v0_layer_param.has_mirror()) { + layer_param->mutable_transform_param()->set_mirror( + v0_layer_param.mirror()); + } + if (v0_layer_param.has_rand_skip()) { + if (type == "data") { + layer_param->mutable_data_param()->set_rand_skip( + v0_layer_param.rand_skip()); + } else if (type == "images") { + layer_param->mutable_image_data_param()->set_rand_skip( + v0_layer_param.rand_skip()); + } else { + LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type; + is_fully_compatible = false; + } + } + if 
(v0_layer_param.has_shuffle_images()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_shuffle( + v0_layer_param.shuffle_images()); + } else { + LOG(ERROR) << "Unknown parameter shuffle for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_new_height()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_new_height( + v0_layer_param.new_height()); + } else { + LOG(ERROR) << "Unknown parameter new_height for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_new_width()) { + if (type == "images") { + layer_param->mutable_image_data_param()->set_new_width( + v0_layer_param.new_width()); + } else { + LOG(ERROR) << "Unknown parameter new_width for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_concat_dim()) { + if (type == "concat") { + layer_param->mutable_concat_param()->set_concat_dim( + v0_layer_param.concat_dim()); + } else { + LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_fg_threshold()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_fg_threshold( + v0_layer_param.det_fg_threshold()); + } else { + LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_bg_threshold()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_bg_threshold( + v0_layer_param.det_bg_threshold()); + } else { + LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_fg_fraction()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_fg_fraction( + v0_layer_param.det_fg_fraction()); + } else { + LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type " + << type; + 
is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_context_pad()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_context_pad( + v0_layer_param.det_context_pad()); + } else { + LOG(ERROR) << "Unknown parameter det_context_pad for layer type " + << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_det_crop_mode()) { + if (type == "window_data") { + layer_param->mutable_window_data_param()->set_crop_mode( + v0_layer_param.det_crop_mode()); + } else { + LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " << type; + is_fully_compatible = false; + } + } + if (v0_layer_param.has_hdf5_output_param()) { + if (type == "hdf5_output") { + layer_param->mutable_hdf5_output_param()->CopyFrom( + v0_layer_param.hdf5_output_param()); + } else { + LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type " + << type; + is_fully_compatible = false; + } + } + } + return is_fully_compatible; } V1LayerParameter_LayerType UpgradeV0LayerType(const string& type) { - if (type == "accuracy") { - return V1LayerParameter_LayerType_ACCURACY; - } else if (type == "bnll") { - return V1LayerParameter_LayerType_BNLL; - } else if (type == "concat") { - return V1LayerParameter_LayerType_CONCAT; - } else if (type == "conv") { - return V1LayerParameter_LayerType_CONVOLUTION; - } else if (type == "data") { - return V1LayerParameter_LayerType_DATA; - } else if (type == "dropout") { - return V1LayerParameter_LayerType_DROPOUT; - } else if (type == "euclidean_loss") { - return V1LayerParameter_LayerType_EUCLIDEAN_LOSS; - } else if (type == "flatten") { - return V1LayerParameter_LayerType_FLATTEN; - } else if (type == "hdf5_data") { - return V1LayerParameter_LayerType_HDF5_DATA; - } else if (type == "hdf5_output") { - return V1LayerParameter_LayerType_HDF5_OUTPUT; - } else if (type == "im2col") { - return V1LayerParameter_LayerType_IM2COL; - } else if (type == "images") { - return V1LayerParameter_LayerType_IMAGE_DATA; - 
} else if (type == "infogain_loss") { - return V1LayerParameter_LayerType_INFOGAIN_LOSS; - } else if (type == "innerproduct") { - return V1LayerParameter_LayerType_INNER_PRODUCT; - } else if (type == "lrn") { - return V1LayerParameter_LayerType_LRN; - } else if (type == "multinomial_logistic_loss") { - return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; - } else if (type == "pool") { - return V1LayerParameter_LayerType_POOLING; - } else if (type == "relu") { - return V1LayerParameter_LayerType_RELU; - } else if (type == "sigmoid") { - return V1LayerParameter_LayerType_SIGMOID; - } else if (type == "softmax") { - return V1LayerParameter_LayerType_SOFTMAX; - } else if (type == "softmax_loss") { - return V1LayerParameter_LayerType_SOFTMAX_LOSS; - } else if (type == "split") { - return V1LayerParameter_LayerType_SPLIT; - } else if (type == "tanh") { - return V1LayerParameter_LayerType_TANH; - } else if (type == "window_data") { - return V1LayerParameter_LayerType_WINDOW_DATA; - } else { - LOG(FATAL) << "Unknown layer name: " << type; - return V1LayerParameter_LayerType_NONE; - } + if (type == "accuracy") { + return V1LayerParameter_LayerType_ACCURACY; + } else if (type == "bnll") { + return V1LayerParameter_LayerType_BNLL; + } else if (type == "concat") { + return V1LayerParameter_LayerType_CONCAT; + } else if (type == "conv") { + return V1LayerParameter_LayerType_CONVOLUTION; + } else if (type == "data") { + return V1LayerParameter_LayerType_DATA; + } else if (type == "dropout") { + return V1LayerParameter_LayerType_DROPOUT; + } else if (type == "euclidean_loss") { + return V1LayerParameter_LayerType_EUCLIDEAN_LOSS; + } else if (type == "flatten") { + return V1LayerParameter_LayerType_FLATTEN; + } else if (type == "hdf5_data") { + return V1LayerParameter_LayerType_HDF5_DATA; + } else if (type == "hdf5_output") { + return V1LayerParameter_LayerType_HDF5_OUTPUT; + } else if (type == "im2col") { + return V1LayerParameter_LayerType_IM2COL; + } else if (type == 
"images") { + return V1LayerParameter_LayerType_IMAGE_DATA; + } else if (type == "infogain_loss") { + return V1LayerParameter_LayerType_INFOGAIN_LOSS; + } else if (type == "innerproduct") { + return V1LayerParameter_LayerType_INNER_PRODUCT; + } else if (type == "lrn") { + return V1LayerParameter_LayerType_LRN; + } else if (type == "multinomial_logistic_loss") { + return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; + } else if (type == "pool") { + return V1LayerParameter_LayerType_POOLING; + } else if (type == "relu") { + return V1LayerParameter_LayerType_RELU; + } else if (type == "sigmoid") { + return V1LayerParameter_LayerType_SIGMOID; + } else if (type == "softmax") { + return V1LayerParameter_LayerType_SOFTMAX; + } else if (type == "softmax_loss") { + return V1LayerParameter_LayerType_SOFTMAX_LOSS; + } else if (type == "split") { + return V1LayerParameter_LayerType_SPLIT; + } else if (type == "tanh") { + return V1LayerParameter_LayerType_TANH; + } else if (type == "window_data") { + return V1LayerParameter_LayerType_WINDOW_DATA; + } else { + LOG(FATAL) << "Unknown layer name: " << type; + return V1LayerParameter_LayerType_NONE; + } } bool NetNeedsDataUpgrade(const NetParameter& net_param) { - for (int i = 0; i < net_param.layers_size(); ++i) { - if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) { - DataParameter layer_param = net_param.layers(i).data_param(); - if (layer_param.has_scale()) { - return true; - } - if (layer_param.has_mean_file()) { - return true; - } - if (layer_param.has_crop_size()) { - return true; - } - if (layer_param.has_mirror()) { - return true; - } - } - if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) { - ImageDataParameter layer_param = net_param.layers(i).image_data_param(); - if (layer_param.has_scale()) { - return true; - } - if (layer_param.has_mean_file()) { - return true; - } - if (layer_param.has_crop_size()) { - return true; - } - if (layer_param.has_mirror()) { - return true; - 
} - } - if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) { - WindowDataParameter layer_param = net_param.layers(i).window_data_param(); - if (layer_param.has_scale()) { - return true; - } - if (layer_param.has_mean_file()) { - return true; - } - if (layer_param.has_crop_size()) { - return true; - } - if (layer_param.has_mirror()) { - return true; - } - } - } - return false; + for (int i = 0; i < net_param.layers_size(); ++i) { + if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) { + DataParameter layer_param = net_param.layers(i).data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) { + ImageDataParameter layer_param = net_param.layers(i).image_data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) { + WindowDataParameter layer_param = net_param.layers(i).window_data_param(); + if (layer_param.has_scale()) { + return true; + } + if (layer_param.has_mean_file()) { + return true; + } + if (layer_param.has_crop_size()) { + return true; + } + if (layer_param.has_mirror()) { + return true; + } + } + } + return false; } #define CONVERT_LAYER_TRANSFORM_PARAM(TYPE, Name, param_name) \ @@ -600,373 +598,364 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) { } while (0) void UpgradeNetDataTransformation(NetParameter* net_param) { - for (int i = 0; i < net_param->layers_size(); ++i) { - CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data); - CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data); - 
CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data); - } + for (int i = 0; i < net_param->layers_size(); ++i) { + CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data); + CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data); + CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data); + } } bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { - bool success = true; - if (NetNeedsV0ToV1Upgrade(*param)) { - // NetParameter was specified using the old style (V0LayerParameter); try to - // upgrade it. - LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V0LayerParameter: " << param_file; - NetParameter original_param(*param); - if (!UpgradeV0Net(original_param, param)) { - success = false; - LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V0NetParameter to NetParameter (see above); continuing anyway."; - } else { - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V0LayerParameter"; - } - LOG(ERROR) << "Note that future Caffe releases will not support " - << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " - << "prototxt and ./build/tools/upgrade_net_proto_binary for model " - << "weights upgrade this and any other net protos to the new format."; - } - // NetParameter uses old style data transformation fields; try to upgrade it. 
- if (NetNeedsDataUpgrade(*param)) { - LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "transformation parameters: " << param_file; - UpgradeNetDataTransformation(param); - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "data transformation parameters."; - LOG(ERROR) << "Note that future Caffe releases will only support " - << "transform_param messages for transformation fields."; - } - if (NetNeedsV1ToV2Upgrade(*param)) { - LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " - << "V1LayerParameter: " << param_file; - NetParameter original_param(*param); - if (!UpgradeV1Net(original_param, param)) { - success = false; - LOG(ERROR) << "Warning: had one or more problems upgrading " - << "V1LayerParameter (see above); continuing anyway."; - } else { - LOG(INFO) << "Successfully upgraded file specified using deprecated " - << "V1LayerParameter"; - } - } - return success; + bool success = true; + if (NetNeedsV0ToV1Upgrade(*param)) { + // NetParameter was specified using the old style (V0LayerParameter); try to + // upgrade it. + LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "V0LayerParameter: " << param_file; + NetParameter original_param(*param); + if (!UpgradeV0Net(original_param, param)) { + success = false; + LOG(ERROR) << "Warning: had one or more problems upgrading " + << "V0NetParameter to NetParameter (see above); continuing anyway."; + } else { + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "V0LayerParameter"; + } + LOG(ERROR) << "Note that future Caffe releases will not support " + << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for " + << "prototxt and ./build/tools/upgrade_net_proto_binary for model " + << "weights upgrade this and any other net protos to the new format."; + } + // NetParameter uses old style data transformation fields; try to upgrade it. 
+ if (NetNeedsDataUpgrade(*param)) { + LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "transformation parameters: " << param_file; + UpgradeNetDataTransformation(param); + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "data transformation parameters."; + LOG(ERROR) << "Note that future Caffe releases will only support " + << "transform_param messages for transformation fields."; + } + if (NetNeedsV1ToV2Upgrade(*param)) { + LOG(ERROR) << "Attempting to upgrade input file specified using deprecated " + << "V1LayerParameter: " << param_file; + NetParameter original_param(*param); + if (!UpgradeV1Net(original_param, param)) { + success = false; + LOG(ERROR) << "Warning: had one or more problems upgrading " + << "V1LayerParameter (see above); continuing anyway."; + } else { + LOG(INFO) << "Successfully upgraded file specified using deprecated " + << "V1LayerParameter"; + } + } + return success; } bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) { - bool is_fully_compatible = true; - if (v1_net_param.layer_size() > 0) { - LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' " - << "fields; these will be ignored for the upgrade."; - is_fully_compatible = false; - } - net_param->CopyFrom(v1_net_param); - net_param->clear_layers(); - net_param->clear_layer(); - for (int i = 0; i < v1_net_param.layers_size(); ++i) { - if (!UpgradeV1LayerParameter(v1_net_param.layers(i), - net_param->add_layer())) { - LOG(ERROR) << "Upgrade of input layer " << i << " failed."; - is_fully_compatible = false; - } - } - return is_fully_compatible; + bool is_fully_compatible = true; + if (v1_net_param.layer_size() > 0) { + LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' " + << "fields; these will be ignored for the upgrade."; + is_fully_compatible = false; + } + net_param->CopyFrom(v1_net_param); + net_param->clear_layers(); + net_param->clear_layer(); + for 
(int i = 0; i < v1_net_param.layers_size(); ++i) { + if (!UpgradeV1LayerParameter(v1_net_param.layers(i), + net_param->add_layer())) { + LOG(ERROR) << "Upgrade of input layer " << i << " failed."; + is_fully_compatible = false; + } + } + return is_fully_compatible; } bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, - LayerParameter* layer_param) { - layer_param->Clear(); - bool is_fully_compatible = true; - for (int i = 0; i < v1_layer_param.bottom_size(); ++i) { - layer_param->add_bottom(v1_layer_param.bottom(i)); - } - for (int i = 0; i < v1_layer_param.top_size(); ++i) { - layer_param->add_top(v1_layer_param.top(i)); - } - if (v1_layer_param.has_name()) { - layer_param->set_name(v1_layer_param.name()); - } - for (int i = 0; i < v1_layer_param.include_size(); ++i) { - layer_param->add_include()->CopyFrom(v1_layer_param.include(i)); - } - for (int i = 0; i < v1_layer_param.exclude_size(); ++i) { - layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i)); - } - if (v1_layer_param.has_type()) { - layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type())); - } - for (int i = 0; i < v1_layer_param.blobs_size(); ++i) { - layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i)); - } - for (int i = 0; i < v1_layer_param.param_size(); ++i) { - while (layer_param->param_size() <= i) { - layer_param->add_param(); - } - layer_param->mutable_param(i)->set_name(v1_layer_param.param(i)); - } - ParamSpec_DimCheckMode mode; - for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) { - while (layer_param->param_size() <= i) { - layer_param->add_param(); - } - switch (v1_layer_param.blob_share_mode(i)) { - case V1LayerParameter_DimCheckMode_STRICT: - mode = ParamSpec_DimCheckMode_STRICT; - break; - case V1LayerParameter_DimCheckMode_PERMISSIVE: - mode = ParamSpec_DimCheckMode_PERMISSIVE; - break; - default: - LOG(FATAL) << "Unknown blob_share_mode: " - << v1_layer_param.blob_share_mode(i); - break; - } - 
layer_param->mutable_param(i)->set_share_mode(mode); - } - for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) { - while (layer_param->param_size() <= i) { - layer_param->add_param(); - } - layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i)); - } - for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) { - while (layer_param->param_size() <= i) { - layer_param->add_param(); - } - layer_param->mutable_param(i)->set_decay_mult( - v1_layer_param.weight_decay(i)); - } - for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) { - layer_param->add_loss_weight(v1_layer_param.loss_weight(i)); - } - if (v1_layer_param.has_accuracy_param()) { - layer_param->mutable_accuracy_param()->CopyFrom( - v1_layer_param.accuracy_param()); - } - if (v1_layer_param.has_argmax_param()) { - layer_param->mutable_argmax_param()->CopyFrom( - v1_layer_param.argmax_param()); - } - if (v1_layer_param.has_concat_param()) { - layer_param->mutable_concat_param()->CopyFrom( - v1_layer_param.concat_param()); - } - if (v1_layer_param.has_contrastive_loss_param()) { - layer_param->mutable_contrastive_loss_param()->CopyFrom( - v1_layer_param.contrastive_loss_param()); - } - if (v1_layer_param.has_convolution_param()) { - layer_param->mutable_convolution_param()->CopyFrom( - v1_layer_param.convolution_param()); - } - if (v1_layer_param.has_data_param()) { - layer_param->mutable_data_param()->CopyFrom( - v1_layer_param.data_param()); - } - if (v1_layer_param.has_dropout_param()) { - layer_param->mutable_dropout_param()->CopyFrom( - v1_layer_param.dropout_param()); - } - if (v1_layer_param.has_dummy_data_param()) { - layer_param->mutable_dummy_data_param()->CopyFrom( - v1_layer_param.dummy_data_param()); - } - if (v1_layer_param.has_eltwise_param()) { - layer_param->mutable_eltwise_param()->CopyFrom( - v1_layer_param.eltwise_param()); - } - if (v1_layer_param.has_exp_param()) { - layer_param->mutable_exp_param()->CopyFrom( - v1_layer_param.exp_param()); - } - if 
(v1_layer_param.has_hdf5_data_param()) { - layer_param->mutable_hdf5_data_param()->CopyFrom( - v1_layer_param.hdf5_data_param()); - } - if (v1_layer_param.has_hdf5_output_param()) { - layer_param->mutable_hdf5_output_param()->CopyFrom( - v1_layer_param.hdf5_output_param()); - } - if (v1_layer_param.has_hinge_loss_param()) { - layer_param->mutable_hinge_loss_param()->CopyFrom( - v1_layer_param.hinge_loss_param()); - } - if (v1_layer_param.has_image_data_param()) { - layer_param->mutable_image_data_param()->CopyFrom( - v1_layer_param.image_data_param()); - } - if (v1_layer_param.has_infogain_loss_param()) { - layer_param->mutable_infogain_loss_param()->CopyFrom( - v1_layer_param.infogain_loss_param()); - } - if (v1_layer_param.has_inner_product_param()) { - layer_param->mutable_inner_product_param()->CopyFrom( - v1_layer_param.inner_product_param()); - } - if (v1_layer_param.has_lrn_param()) { - layer_param->mutable_lrn_param()->CopyFrom( - v1_layer_param.lrn_param()); - } - if (v1_layer_param.has_memory_data_param()) { - layer_param->mutable_memory_data_param()->CopyFrom( - v1_layer_param.memory_data_param()); - } - if (v1_layer_param.has_mvn_param()) { - layer_param->mutable_mvn_param()->CopyFrom( - v1_layer_param.mvn_param()); - } - if (v1_layer_param.has_pooling_param()) { - layer_param->mutable_pooling_param()->CopyFrom( - v1_layer_param.pooling_param()); - } - if (v1_layer_param.has_power_param()) { - layer_param->mutable_power_param()->CopyFrom( - v1_layer_param.power_param()); - } - if (v1_layer_param.has_relu_param()) { - layer_param->mutable_relu_param()->CopyFrom( - v1_layer_param.relu_param()); - } - if (v1_layer_param.has_sigmoid_param()) { - layer_param->mutable_sigmoid_param()->CopyFrom( - v1_layer_param.sigmoid_param()); - } - if (v1_layer_param.has_softmax_param()) { - layer_param->mutable_softmax_param()->CopyFrom( - v1_layer_param.softmax_param()); - } - if (v1_layer_param.has_slice_param()) { - layer_param->mutable_slice_param()->CopyFrom( - 
v1_layer_param.slice_param()); - } - if (v1_layer_param.has_tanh_param()) { - layer_param->mutable_tanh_param()->CopyFrom( - v1_layer_param.tanh_param()); - } - if (v1_layer_param.has_threshold_param()) { - layer_param->mutable_threshold_param()->CopyFrom( - v1_layer_param.threshold_param()); - } - if (v1_layer_param.has_window_data_param()) { - layer_param->mutable_window_data_param()->CopyFrom( - v1_layer_param.window_data_param()); - } - if (v1_layer_param.has_transform_param()) { - layer_param->mutable_transform_param()->CopyFrom( - v1_layer_param.transform_param()); - } - if (v1_layer_param.has_loss_param()) { - layer_param->mutable_loss_param()->CopyFrom( - v1_layer_param.loss_param()); - } - if (v1_layer_param.has_layer()) { - LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring."; - is_fully_compatible = false; - } - return is_fully_compatible; + LayerParameter* layer_param) { + layer_param->Clear(); + bool is_fully_compatible = true; + for (int i = 0; i < v1_layer_param.bottom_size(); ++i) { + layer_param->add_bottom(v1_layer_param.bottom(i)); + } + for (int i = 0; i < v1_layer_param.top_size(); ++i) { + layer_param->add_top(v1_layer_param.top(i)); + } + if (v1_layer_param.has_name()) { + layer_param->set_name(v1_layer_param.name()); + } + for (int i = 0; i < v1_layer_param.include_size(); ++i) { + layer_param->add_include()->CopyFrom(v1_layer_param.include(i)); + } + for (int i = 0; i < v1_layer_param.exclude_size(); ++i) { + layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i)); + } + if (v1_layer_param.has_type()) { + layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type())); + } + for (int i = 0; i < v1_layer_param.blobs_size(); ++i) { + layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i)); + } + for (int i = 0; i < v1_layer_param.param_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_name(v1_layer_param.param(i)); + } + ParamSpec_DimCheckMode 
mode; + for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + switch (v1_layer_param.blob_share_mode(i)) { + case V1LayerParameter_DimCheckMode_STRICT: + mode = ParamSpec_DimCheckMode_STRICT; + break; + case V1LayerParameter_DimCheckMode_PERMISSIVE: + mode = ParamSpec_DimCheckMode_PERMISSIVE; + break; + default: + LOG(FATAL) << "Unknown blob_share_mode: " + << v1_layer_param.blob_share_mode(i); + break; + } + layer_param->mutable_param(i)->set_share_mode(mode); + } + for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i)); + } + for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) { + while (layer_param->param_size() <= i) { + layer_param->add_param(); + } + layer_param->mutable_param(i)->set_decay_mult( + v1_layer_param.weight_decay(i)); + } + for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) { + layer_param->add_loss_weight(v1_layer_param.loss_weight(i)); + } + if (v1_layer_param.has_accuracy_param()) { + layer_param->mutable_accuracy_param()->CopyFrom( + v1_layer_param.accuracy_param()); + } + if (v1_layer_param.has_argmax_param()) { + layer_param->mutable_argmax_param()->CopyFrom( + v1_layer_param.argmax_param()); + } + if (v1_layer_param.has_concat_param()) { + layer_param->mutable_concat_param()->CopyFrom( + v1_layer_param.concat_param()); + } + if (v1_layer_param.has_contrastive_loss_param()) { + layer_param->mutable_contrastive_loss_param()->CopyFrom( + v1_layer_param.contrastive_loss_param()); + } + if (v1_layer_param.has_convolution_param()) { + layer_param->mutable_convolution_param()->CopyFrom( + v1_layer_param.convolution_param()); + } + if (v1_layer_param.has_data_param()) { + layer_param->mutable_data_param()->CopyFrom(v1_layer_param.data_param()); + } + if (v1_layer_param.has_dropout_param()) { + 
layer_param->mutable_dropout_param()->CopyFrom( + v1_layer_param.dropout_param()); + } + if (v1_layer_param.has_dummy_data_param()) { + layer_param->mutable_dummy_data_param()->CopyFrom( + v1_layer_param.dummy_data_param()); + } + if (v1_layer_param.has_eltwise_param()) { + layer_param->mutable_eltwise_param()->CopyFrom( + v1_layer_param.eltwise_param()); + } + if (v1_layer_param.has_exp_param()) { + layer_param->mutable_exp_param()->CopyFrom(v1_layer_param.exp_param()); + } + if (v1_layer_param.has_hdf5_data_param()) { + layer_param->mutable_hdf5_data_param()->CopyFrom( + v1_layer_param.hdf5_data_param()); + } + if (v1_layer_param.has_hdf5_output_param()) { + layer_param->mutable_hdf5_output_param()->CopyFrom( + v1_layer_param.hdf5_output_param()); + } + if (v1_layer_param.has_hinge_loss_param()) { + layer_param->mutable_hinge_loss_param()->CopyFrom( + v1_layer_param.hinge_loss_param()); + } + if (v1_layer_param.has_image_data_param()) { + layer_param->mutable_image_data_param()->CopyFrom( + v1_layer_param.image_data_param()); + } + if (v1_layer_param.has_infogain_loss_param()) { + layer_param->mutable_infogain_loss_param()->CopyFrom( + v1_layer_param.infogain_loss_param()); + } + if (v1_layer_param.has_inner_product_param()) { + layer_param->mutable_inner_product_param()->CopyFrom( + v1_layer_param.inner_product_param()); + } + if (v1_layer_param.has_lrn_param()) { + layer_param->mutable_lrn_param()->CopyFrom(v1_layer_param.lrn_param()); + } + if (v1_layer_param.has_memory_data_param()) { + layer_param->mutable_memory_data_param()->CopyFrom( + v1_layer_param.memory_data_param()); + } + if (v1_layer_param.has_mvn_param()) { + layer_param->mutable_mvn_param()->CopyFrom(v1_layer_param.mvn_param()); + } + if (v1_layer_param.has_pooling_param()) { + layer_param->mutable_pooling_param()->CopyFrom( + v1_layer_param.pooling_param()); + } + if (v1_layer_param.has_power_param()) { + layer_param->mutable_power_param()->CopyFrom(v1_layer_param.power_param()); + } + if 
(v1_layer_param.has_relu_param()) { + layer_param->mutable_relu_param()->CopyFrom(v1_layer_param.relu_param()); + } + if (v1_layer_param.has_sigmoid_param()) { + layer_param->mutable_sigmoid_param()->CopyFrom( + v1_layer_param.sigmoid_param()); + } + if (v1_layer_param.has_softmax_param()) { + layer_param->mutable_softmax_param()->CopyFrom( + v1_layer_param.softmax_param()); + } + if (v1_layer_param.has_slice_param()) { + layer_param->mutable_slice_param()->CopyFrom(v1_layer_param.slice_param()); + } + if (v1_layer_param.has_tanh_param()) { + layer_param->mutable_tanh_param()->CopyFrom(v1_layer_param.tanh_param()); + } + if (v1_layer_param.has_threshold_param()) { + layer_param->mutable_threshold_param()->CopyFrom( + v1_layer_param.threshold_param()); + } + if (v1_layer_param.has_window_data_param()) { + layer_param->mutable_window_data_param()->CopyFrom( + v1_layer_param.window_data_param()); + } + if (v1_layer_param.has_transform_param()) { + layer_param->mutable_transform_param()->CopyFrom( + v1_layer_param.transform_param()); + } + if (v1_layer_param.has_loss_param()) { + layer_param->mutable_loss_param()->CopyFrom(v1_layer_param.loss_param()); + } + if (v1_layer_param.has_layer()) { + LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring."; + is_fully_compatible = false; + } + return is_fully_compatible; } const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) { - switch (type) { - case V1LayerParameter_LayerType_NONE: - return ""; - case V1LayerParameter_LayerType_ABSVAL: - return "AbsVal"; - case V1LayerParameter_LayerType_ACCURACY: - return "Accuracy"; - case V1LayerParameter_LayerType_ARGMAX: - return "ArgMax"; - case V1LayerParameter_LayerType_BNLL: - return "BNLL"; - case V1LayerParameter_LayerType_CONCAT: - return "Concat"; - case V1LayerParameter_LayerType_CONTRASTIVE_LOSS: - return "ContrastiveLoss"; - case V1LayerParameter_LayerType_CONVOLUTION: - return "Convolution"; - case V1LayerParameter_LayerType_DECONVOLUTION: - return 
"Deconvolution"; - case V1LayerParameter_LayerType_DATA: - return "Data"; - case V1LayerParameter_LayerType_DROPOUT: - return "Dropout"; - case V1LayerParameter_LayerType_DUMMY_DATA: - return "DummyData"; - case V1LayerParameter_LayerType_EUCLIDEAN_LOSS: - return "EuclideanLoss"; - case V1LayerParameter_LayerType_ELTWISE: - return "Eltwise"; - case V1LayerParameter_LayerType_EXP: - return "Exp"; - case V1LayerParameter_LayerType_FLATTEN: - return "Flatten"; - case V1LayerParameter_LayerType_HDF5_DATA: - return "HDF5Data"; - case V1LayerParameter_LayerType_HDF5_OUTPUT: - return "HDF5Output"; - case V1LayerParameter_LayerType_HINGE_LOSS: - return "HingeLoss"; - case V1LayerParameter_LayerType_IM2COL: - return "Im2col"; - case V1LayerParameter_LayerType_IMAGE_DATA: - return "ImageData"; - case V1LayerParameter_LayerType_INFOGAIN_LOSS: - return "InfogainLoss"; - case V1LayerParameter_LayerType_INNER_PRODUCT: - return "InnerProduct"; - case V1LayerParameter_LayerType_LRN: - return "LRN"; - case V1LayerParameter_LayerType_MEMORY_DATA: - return "MemoryData"; - case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS: - return "MultinomialLogisticLoss"; - case V1LayerParameter_LayerType_MVN: - return "MVN"; - case V1LayerParameter_LayerType_POOLING: - return "Pooling"; - case V1LayerParameter_LayerType_POWER: - return "Power"; - case V1LayerParameter_LayerType_RELU: - return "ReLU"; - case V1LayerParameter_LayerType_SIGMOID: - return "Sigmoid"; - case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS: - return "SigmoidCrossEntropyLoss"; - case V1LayerParameter_LayerType_SILENCE: - return "Silence"; - case V1LayerParameter_LayerType_SOFTMAX: - return "Softmax"; - case V1LayerParameter_LayerType_SOFTMAX_LOSS: - return "SoftmaxWithLoss"; - case V1LayerParameter_LayerType_SPLIT: - return "Split"; - case V1LayerParameter_LayerType_SLICE: - return "Slice"; - case V1LayerParameter_LayerType_TANH: - return "TanH"; - case V1LayerParameter_LayerType_WINDOW_DATA: - return 
"WindowData"; - case V1LayerParameter_LayerType_THRESHOLD: - return "Threshold"; - default: - LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type; - return ""; - } + switch (type) { + case V1LayerParameter_LayerType_NONE: + return ""; + case V1LayerParameter_LayerType_ABSVAL: + return "AbsVal"; + case V1LayerParameter_LayerType_ACCURACY: + return "Accuracy"; + case V1LayerParameter_LayerType_ARGMAX: + return "ArgMax"; + case V1LayerParameter_LayerType_BNLL: + return "BNLL"; + case V1LayerParameter_LayerType_CONCAT: + return "Concat"; + case V1LayerParameter_LayerType_CONTRASTIVE_LOSS: + return "ContrastiveLoss"; + case V1LayerParameter_LayerType_CONVOLUTION: + return "Convolution"; + case V1LayerParameter_LayerType_DECONVOLUTION: + return "Deconvolution"; + case V1LayerParameter_LayerType_DATA: + return "Data"; + case V1LayerParameter_LayerType_DROPOUT: + return "Dropout"; + case V1LayerParameter_LayerType_DUMMY_DATA: + return "DummyData"; + case V1LayerParameter_LayerType_EUCLIDEAN_LOSS: + return "EuclideanLoss"; + case V1LayerParameter_LayerType_ELTWISE: + return "Eltwise"; + case V1LayerParameter_LayerType_EXP: + return "Exp"; + case V1LayerParameter_LayerType_FLATTEN: + return "Flatten"; + case V1LayerParameter_LayerType_HDF5_DATA: + return "HDF5Data"; + case V1LayerParameter_LayerType_HDF5_OUTPUT: + return "HDF5Output"; + case V1LayerParameter_LayerType_HINGE_LOSS: + return "HingeLoss"; + case V1LayerParameter_LayerType_IM2COL: + return "Im2col"; + case V1LayerParameter_LayerType_IMAGE_DATA: + return "ImageData"; + case V1LayerParameter_LayerType_INFOGAIN_LOSS: + return "InfogainLoss"; + case V1LayerParameter_LayerType_INNER_PRODUCT: + return "InnerProduct"; + case V1LayerParameter_LayerType_LRN: + return "LRN"; + case V1LayerParameter_LayerType_MEMORY_DATA: + return "MemoryData"; + case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS: + return "MultinomialLogisticLoss"; + case V1LayerParameter_LayerType_MVN: + return "MVN"; + case 
V1LayerParameter_LayerType_POOLING: + return "Pooling"; + case V1LayerParameter_LayerType_POWER: + return "Power"; + case V1LayerParameter_LayerType_RELU: + return "ReLU"; + case V1LayerParameter_LayerType_SIGMOID: + return "Sigmoid"; + case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS: + return "SigmoidCrossEntropyLoss"; + case V1LayerParameter_LayerType_SILENCE: + return "Silence"; + case V1LayerParameter_LayerType_SOFTMAX: + return "Softmax"; + case V1LayerParameter_LayerType_SOFTMAX_LOSS: + return "SoftmaxWithLoss"; + case V1LayerParameter_LayerType_SPLIT: + return "Split"; + case V1LayerParameter_LayerType_SLICE: + return "Slice"; + case V1LayerParameter_LayerType_TANH: + return "TanH"; + case V1LayerParameter_LayerType_WINDOW_DATA: + return "WindowData"; + case V1LayerParameter_LayerType_THRESHOLD: + return "Threshold"; + default: + LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type; + return ""; + } } void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param) { - CHECK(ReadProtoFromTextFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; - UpgradeNetAsNeeded(param_file, param); + NetParameter* param) { + CHECK(ReadProtoFromTextFile(param_file, param)) + << "Failed to parse NetParameter file: " << param_file; + UpgradeNetAsNeeded(param_file, param); } void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param) { - CHECK(ReadProtoFromBinaryFile(param_file, param)) - << "Failed to parse NetParameter file: " << param_file; - UpgradeNetAsNeeded(param_file, param); + NetParameter* param) { + CHECK(ReadProtoFromBinaryFile(param_file, param)) + << "Failed to parse NetParameter file: " << param_file; + UpgradeNetAsNeeded(param_file, param); } } // namespace caffe From ae39d5df509451a28b9e920bbd9cfc3b0aea54ad Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 12 Sep 2015 01:48:39 +0800 Subject: [PATCH 099/124] Passed dropout unit test --- include/caffe/common.hpp | 2 
+- include/caffe/neuron_layers.hpp | 6 -- include/caffe/util/ocl_wrapper.hpp | 8 ++- src/caffe/common.cpp | 1 + src/caffe/layers/dropout_layer.cpp | 95 ++++++++++-------------------- src/caffe/ocl/dropout_layer.cl | 22 +++---- src/caffe/util/math_functions.cpp | 1 + src/caffe/util/ocl_wrapper.cpp | 41 +++++++------ 8 files changed, 73 insertions(+), 103 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 0f3a7667..8993af45 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -73,7 +73,7 @@ private:\ #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" //OpenCL: various of defines to choose the design schemes /* ifdef: use CPU random generator in dropout layer - ifndef: use GPU randome generator*/ + ifndef: use GPU random generator*/ //#define use_cpu_generator_dropout //#define print_memory_trace //the following are macro defines for optimization schmes in conv layer diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 89b6c481..dfbaa199 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -190,12 +190,6 @@ class DropoutLayer: public NeuronLayer { virtual inline const char* type() const { return "Dropout"; } - virtual ~DropoutLayer(); - void ocl_setup(int bottom_count); - cl_mem MaskMem; - cl_kernel ocl_Kernel_Fwd; - cl_kernel ocl_Kernel_Bwd; - cl_kernel rng_kernel; protected: /** diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 869bc83b..5fe5ab9e 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -205,16 +205,18 @@ void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y); template void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype *top_data); + const unsigned int* MaskMem, const unsigned int threshold, const float scale_, Dtype *top_data); template -void DropoutBackward(const int count, 
const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff); +void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem, + const unsigned int threshold_, const float scale_, Dtype* bottom_diff); template void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold); +void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed = 0); + template void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 2157c96a..20799433 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -105,6 +105,7 @@ Caffe::~Caffe() { void Caffe::set_random_seed(const unsigned int seed) { // RNG seed Get().random_generator_.reset(new RNG(seed)); + caffe_gpu_uniform(0, NULL, seed); } void Caffe::SetDevice(const int device_id) { diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index de8f5607..05de4944 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -11,15 +11,6 @@ namespace caffe { template -void DropoutLayer::ocl_setup(int bottom_count) { - MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, - bottom_count * sizeof(int), NULL, NULL); -} - -template -DropoutLayer::~DropoutLayer() { - OCL_CHECK (clReleaseMemObject(MaskMem) ); - }template void DropoutLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { NeuronLayer < Dtype > ::LayerSetUp(bottom, top); @@ -28,7 +19,6 @@ void DropoutLayer::LayerSetUp(const vector*>& bottom, DCHECK(threshold_ < 1.); scale_ = 1. / (1. 
- threshold_); uint_thres_ = static_cast(UINT_MAX * threshold_); - ocl_setup(bottom[0]->count()); } template @@ -77,69 +67,44 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } -#define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\ -do{ \ - int *global_mem_cpu = new int[count]; \ - clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)global_mem, \ - CL_TRUE, 0, sizeof(int)*count, global_mem_cpu,0, NULL, NULL); \ - size_t sample_interval = count/num; \ - if(sample_interval == 0){ \ - sample_interval=1; \ - } \ - printf("%s: ", marker); \ - for(int i=0; i void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - //unsigned int* mask = - // static_cast(rand_vec_.mutable_gpu_data()); -#ifdef use_cpu_generator_dropout - unsigned int* mask_cpu = - static_cast(rand_vec_.mutable_cpu_data()); - caffe_rng_bernoulli(count, 1. 
- threshold_, mask_cpu); - OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) ); - DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data); -#else - caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1., - threshold_); - DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_, - top_data); -#endif - } else { - caffe_gpu_copy(count, bottom_data, top_data); - } -CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + unsigned int* mask = + static_cast(rand_vec_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, mask); + // set thresholds + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data); + } else { + caffe_gpu_copy(count, bottom_data, top_data); + } } template void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const int count = bottom[0]->count(); - DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_, - (Dtype) scale_, bottom_diff); - } else { - caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); - } - CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask"); - CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff"); - } + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = + static_cast(rand_vec_.gpu_data()); + 
const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff); + } else { + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + } } + #ifdef CPU_ONLY STUB_GPU(DropoutLayer); #endif diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl index bb2fc696..98d44f86 100644 --- a/src/caffe/ocl/dropout_layer.cl +++ b/src/caffe/ocl/dropout_layer.cl @@ -25,19 +25,21 @@ **************************************************************************************/ template -__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) { +__kernel void DropoutForward(const int n, __global T *in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global T *out) { int index = get_global_id(0); - if (index < n) - out[index] = in[index] * scale * mask[index]; + if (index < n) { + out[index] = in[index] * scale * (mask[index] > threshold); + } } -template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out); -template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out); +template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out); template -__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, 
const int unsigned threshold, const T scale, __global T *out_diff) { +__kernel void DropoutBackward(const int n, __global T *in_diff, __global const unsigned int *mask, const unsigned int threshold, const float scale, __global T *out_diff) { int index = get_global_id(0); - if (index < n) - out_diff[index] = in_diff[index] * scale * mask[index]; + if (index < n) { + out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); + } } -template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); -template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff); +template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out_diff); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 3275d75c..6b76a9ef 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -886,6 +886,7 @@ uint32_t caffe_gpu_hamming_distance(const int n, const double* x, } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { + caffe_gpu_uniform(n, r); } template <> diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 75b69215..29a12330 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -145,12 +145,16 @@ void 
caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup); template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup); -void caffe_gpu_uniform(const unsigned int n, unsigned int *r) +void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed) { + static unsigned c = 0; + if ((n == 0) || (r == NULL)) { + c = _seed; + return; + } std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - static unsigned c = 0; unsigned nrounds = 20; array4x32 rndctr4; rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; @@ -1692,33 +1696,33 @@ template void caffe_gpu_powx(const int n, const double* a, template void DropoutForward(const int count, const Dtype* bottom_data, - const int* MaskMem, const Dtype scale_, Dtype* top_data) { + const unsigned int* MaskMem, const unsigned int threshold, const float scale_, Dtype* top_data) { std::string kernel_name = "DropoutForward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); cl_int ret; - ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_); - ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &top_data); OCL_CHECK(ret); size_t Global_Work_Size[] = { (size_t) count }; size_t Local_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + 
OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void DropoutForward(const int count, const float* bottom_data, - const int* MaskMem, const float scale_, float* top_data); + const unsigned int* MaskMem, const unsigned int threshold, const float scale_, float* top_data); template void DropoutForward(const int count, const double* bottom_data, - const int* MaskMem, const double scale_, double* top_data); + const unsigned int* MaskMem, const unsigned int threshold, const float scale_, double* top_data); template -void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, - const float threshold_, const Dtype scale_, Dtype* bottom_diff) { +void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem, + const unsigned int threshold_, const float scale_, Dtype* bottom_diff) { std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); cl_kernel kernel = amdDevice.GetKernel(kernel_name); @@ -1726,8 +1730,8 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); - ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_); - ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); OCL_CHECK(ret); @@ -1738,10 +1742,10 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); } template void DropoutBackward(const int count, const float* top_diff, - const int* MaskMem, const float threshold_, const 
float scale_, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, float* bottom_diff); template void DropoutBackward(const int count, const double* top_diff, - const int* MaskMem, const float threshold_, const double scale_, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, double* bottom_diff); template @@ -1927,7 +1931,8 @@ template void ocl_conv(float* bottom_data, float* top_data, int stride, int pad, int batch_sz); template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, - int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, - int stride, int pad, int batch_sz); + int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, + int stride, int pad, int batch_sz); } // namespace caffe + From cb7cd7bbde88907e0a97d5499fc9c4ba07cd0767 Mon Sep 17 00:00:00 2001 From: Junli Date: Sat, 12 Sep 2015 21:55:30 -0700 Subject: [PATCH 100/124] removed cmakefiles from git repo --- .../CMakeDirectoryInformation.cmake | 16 - .../CMakeFiles/caffe.dir/DependInfo.cmake | 108 - src/caffe/CMakeFiles/caffe.dir/build.make | 2542 ----------------- .../CMakeFiles/caffe.dir/cmake_clean.cmake | 126 - src/caffe/CMakeFiles/caffe.dir/depend.make | 2 - src/caffe/CMakeFiles/caffe.dir/flags.make | 8 - src/caffe/CMakeFiles/caffe.dir/link.txt | 1 - src/caffe/CMakeFiles/caffe.dir/progress.make | 118 - ..._compile_generated_absval_layer.cu.o.cmake | 296 -- ...compile_generated_absval_layer.cu.o.depend | 1 - ...mpile_generated_base_data_layer.cu.o.cmake | 296 -- ...pile_generated_base_data_layer.cu.o.depend | 1 - ...da_compile_generated_bnll_layer.cu.o.cmake | 296 -- ...a_compile_generated_bnll_layer.cu.o.depend | 1 - ..._compile_generated_concat_layer.cu.o.cmake | 296 -- ...compile_generated_concat_layer.cu.o.depend | 1 - ...enerated_contrastive_loss_layer.cu.o.cmake | 296 -- ...nerated_contrastive_loss_layer.cu.o.depend | 1 - 
...da_compile_generated_conv_layer.cu.o.cmake | 296 -- ...a_compile_generated_conv_layer.cu.o.depend | 1 - ...pile_generated_cudnn_conv_layer.cu.o.cmake | 296 -- ...ile_generated_cudnn_conv_layer.cu.o.depend | 1 - ...e_generated_cudnn_pooling_layer.cu.o.cmake | 296 -- ..._generated_cudnn_pooling_layer.cu.o.depend | 1 - ...pile_generated_cudnn_relu_layer.cu.o.cmake | 296 -- ...ile_generated_cudnn_relu_layer.cu.o.depend | 1 - ...e_generated_cudnn_sigmoid_layer.cu.o.cmake | 296 -- ..._generated_cudnn_sigmoid_layer.cu.o.depend | 1 - ...e_generated_cudnn_softmax_layer.cu.o.cmake | 296 -- ..._generated_cudnn_softmax_layer.cu.o.depend | 1 - ...pile_generated_cudnn_tanh_layer.cu.o.cmake | 296 -- ...ile_generated_cudnn_tanh_layer.cu.o.depend | 1 - ..._compile_generated_deconv_layer.cu.o.cmake | 296 -- ...compile_generated_deconv_layer.cu.o.depend | 1 - ...compile_generated_dropout_layer.cu.o.cmake | 296 -- ...ompile_generated_dropout_layer.cu.o.depend | 1 - ...compile_generated_eltwise_layer.cu.o.cmake | 296 -- ...ompile_generated_eltwise_layer.cu.o.depend | 1 - ..._generated_euclidean_loss_layer.cu.o.cmake | 296 -- ...generated_euclidean_loss_layer.cu.o.depend | 1 - ...uda_compile_generated_exp_layer.cu.o.cmake | 296 -- ...da_compile_generated_exp_layer.cu.o.depend | 1 - ..._compile_generated_filter_layer.cu.o.cmake | 296 -- ...compile_generated_filter_layer.cu.o.depend | 1 - ...mpile_generated_hdf5_data_layer.cu.o.cmake | 296 -- ...pile_generated_hdf5_data_layer.cu.o.depend | 1 - ...ile_generated_hdf5_output_layer.cu.o.cmake | 296 -- ...le_generated_hdf5_output_layer.cu.o.depend | 1 - ..._compile_generated_im2col_layer.cu.o.cmake | 296 -- ...compile_generated_im2col_layer.cu.o.depend | 1 - ...e_generated_inner_product_layer.cu.o.cmake | 296 -- ..._generated_inner_product_layer.cu.o.depend | 1 - ...uda_compile_generated_log_layer.cu.o.cmake | 296 -- ...da_compile_generated_log_layer.cu.o.depend | 1 - ...uda_compile_generated_lrn_layer.cu.o.cmake | 296 -- 
...da_compile_generated_lrn_layer.cu.o.depend | 1 - ...uda_compile_generated_mvn_layer.cu.o.cmake | 296 -- ...da_compile_generated_mvn_layer.cu.o.depend | 1 - ...compile_generated_pooling_layer.cu.o.cmake | 296 -- ...ompile_generated_pooling_layer.cu.o.depend | 1 - ...a_compile_generated_power_layer.cu.o.cmake | 296 -- ..._compile_generated_power_layer.cu.o.depend | 1 - ...a_compile_generated_prelu_layer.cu.o.cmake | 296 -- ..._compile_generated_prelu_layer.cu.o.depend | 1 - ...mpile_generated_reduction_layer.cu.o.cmake | 296 -- ...pile_generated_reduction_layer.cu.o.depend | 1 - ...da_compile_generated_relu_layer.cu.o.cmake | 296 -- ...a_compile_generated_relu_layer.cu.o.depend | 1 - ...igmoid_cross_entropy_loss_layer.cu.o.cmake | 296 -- ...gmoid_cross_entropy_loss_layer.cu.o.depend | 470 --- ...compile_generated_sigmoid_layer.cu.o.cmake | 296 -- ...ompile_generated_sigmoid_layer.cu.o.depend | 468 --- ...compile_generated_silence_layer.cu.o.cmake | 296 -- ...ompile_generated_silence_layer.cu.o.depend | 1 - ...a_compile_generated_slice_layer.cu.o.cmake | 296 -- ..._compile_generated_slice_layer.cu.o.depend | 1 - ...compile_generated_softmax_layer.cu.o.cmake | 296 -- ...ompile_generated_softmax_layer.cu.o.depend | 1 - ...le_generated_softmax_loss_layer.cu.o.cmake | 296 -- ...e_generated_softmax_loss_layer.cu.o.depend | 1 - ...a_compile_generated_split_layer.cu.o.cmake | 296 -- ..._compile_generated_split_layer.cu.o.depend | 1 - ...da_compile_generated_tanh_layer.cu.o.cmake | 296 -- ...a_compile_generated_tanh_layer.cu.o.depend | 1 - ...mpile_generated_threshold_layer.cu.o.cmake | 296 -- ...pile_generated_threshold_layer.cu.o.depend | 1 - .../cuda_compile_generated_im2col.cu.o.cmake | 296 -- .../cuda_compile_generated_im2col.cu.o.depend | 404 --- ...ompile_generated_math_functions.cu.o.cmake | 296 -- ...mpile_generated_math_functions.cu.o.depend | 744 ----- src/caffe/CMakeFiles/progress.marks | 1 - .../CMakeFiles/proto.dir/CXX.includecache | 48 - 
.../CMakeFiles/proto.dir/DependInfo.cmake | 39 - src/caffe/CMakeFiles/proto.dir/build.make | 119 - .../CMakeFiles/proto.dir/cmake_clean.cmake | 13 - .../proto.dir/cmake_clean_target.cmake | 3 - .../CMakeFiles/proto.dir/depend.internal | 6 - src/caffe/CMakeFiles/proto.dir/depend.make | 6 - src/caffe/CMakeFiles/proto.dir/flags.make | 8 - src/caffe/CMakeFiles/proto.dir/link.txt | 2 - src/caffe/CMakeFiles/proto.dir/progress.make | 3 - src/caffe/CMakeLists.txt | 36 - 102 files changed, 17464 deletions(-) delete mode 100644 src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake delete mode 100644 src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake delete mode 100644 src/caffe/CMakeFiles/caffe.dir/build.make delete mode 100644 src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake delete mode 100644 src/caffe/CMakeFiles/caffe.dir/depend.make delete mode 100644 src/caffe/CMakeFiles/caffe.dir/flags.make delete mode 100644 src/caffe/CMakeFiles/caffe.dir/link.txt delete mode 100644 src/caffe/CMakeFiles/caffe.dir/progress.make delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend delete mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend delete mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend delete mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend delete mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend delete mode 100644 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend delete mode 100644 src/caffe/CMakeFiles/progress.marks delete mode 100644 src/caffe/CMakeFiles/proto.dir/CXX.includecache delete mode 100644 src/caffe/CMakeFiles/proto.dir/DependInfo.cmake delete mode 100644 src/caffe/CMakeFiles/proto.dir/build.make delete mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake delete mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake delete mode 100644 src/caffe/CMakeFiles/proto.dir/depend.internal delete mode 100644 src/caffe/CMakeFiles/proto.dir/depend.make delete mode 100644 src/caffe/CMakeFiles/proto.dir/flags.make delete mode 100644 src/caffe/CMakeFiles/proto.dir/link.txt delete mode 100644 src/caffe/CMakeFiles/proto.dir/progress.make delete mode 100644 src/caffe/CMakeLists.txt diff --git a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake 
deleted file mode 100644 index 7bb0014c..00000000 --- a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# Relative path conversion top directories. -SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe") -SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe") - -# Force unix paths in dependencies. -SET(CMAKE_FORCE_UNIX_PATHS 1) - - -# The C and CXX include file regular expressions for this directory. -SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$") -SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$") -SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN}) -SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN}) diff --git a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake deleted file mode 100644 index 1678bc46..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake +++ /dev/null @@ -1,108 +0,0 @@ -# The set of languages for which implicit dependencies are needed: -SET(CMAKE_DEPENDS_LANGUAGES - "CXX" - ) -# The set of files for implicit dependencies of each language: -SET(CMAKE_DEPENDS_CHECK_CXX - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/blob.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/common.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/device.cpp.o" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/net.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/solver.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" - ) -SET(CMAKE_CXX_COMPILER_ID "GNU") - -# Preprocessor definitions for this target. 
-SET(CMAKE_TARGET_DEFINITIONS - "GTEST_USE_OWN_TR1_TUPLE" - ) - -# Targets to which this target links. -SET(CMAKE_TARGET_LINKED_INFO_FILES - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake" - ) - -# The include file search paths: -SET(CMAKE_C_TARGET_INCLUDE_PATH - "src" - "/usr/local/include" - "include" - "/usr/local/cuda/include" - "/usr/local/include/opencv" - "/usr/include/atlas" - "." - ) -SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/CMakeFiles/caffe.dir/build.make b/src/caffe/CMakeFiles/caffe.dir/build.make deleted file mode 100644 index 916913ae..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/build.make +++ /dev/null @@ -1,2542 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Remove some rules from gmake that .SUFFIXES does not remove. -SUFFIXES = - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E remove -f - -# Escaping for special characters. -EQUALS = = - -# The program to use to edit the cache. -CMAKE_EDIT_COMMAND = /usr/bin/ccmake - -# The top-level source directory on which CMake was run. 
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# Include any dependencies generated for this target. -include src/caffe/CMakeFiles/caffe.dir/depend.make - -# Include the progress variables for this target. -include src/caffe/CMakeFiles/caffe.dir/progress.make - -# Include the compile flags for this target's objects. -include src/caffe/CMakeFiles/caffe.dir/flags.make - -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/util/math_functions.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/util/im2col.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/layers/cufiles/sigmoid_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/layers/cufiles/bnll_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/layers/cufiles/conv_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/layers/cufiles/pooling_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/layers/cufiles/log_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/layers/cufiles/reduction_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/layers/cufiles/silence_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/layers/cufiles/power_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/layers/cufiles/split_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/layers/cufiles/absval_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/layers/cufiles/hdf5_output_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/layers/cufiles/base_data_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/layers/cufiles/dropout_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/layers/cufiles/cudnn_tanh_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/layers/cufiles/relu_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/layers/cufiles/cudnn_conv_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/layers/cufiles/contrastive_loss_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/layers/cufiles/concat_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/layers/cufiles/softmax_loss_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/layers/cufiles/cudnn_softmax_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/layers/cufiles/inner_product_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/layers/cufiles/filter_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/layers/cufiles/prelu_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/layers/cufiles/im2col_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/layers/cufiles/hdf5_data_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/layers/cufiles/deconv_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/layers/cufiles/mvn_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/layers/cufiles/tanh_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/layers/cufiles/slice_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/layers/cufiles/threshold_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/layers/cufiles/lrn_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/layers/cufiles/eltwise_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/layers/cufiles/exp_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/layers/cufiles/euclidean_loss_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/layers/cufiles/cudnn_relu_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/layers/cufiles/cudnn_pooling_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/layers/cufiles/softmax_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake - -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake -src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake - -src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/common.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/common.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp - -src/caffe/CMakeFiles/caffe.dir/common.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/common.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp > CMakeFiles/caffe.dir/common.cpp.i - -src/caffe/CMakeFiles/caffe.dir/common.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/common.cpp.s" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp -o CMakeFiles/caffe.dir/common.cpp.s - -src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/common.cpp.o - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/blob.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/blob.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/blob.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp > CMakeFiles/caffe.dir/blob.cpp.i - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/blob.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp -o CMakeFiles/caffe.dir/blob.cpp.s - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/util/ocl_wrapper.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp > CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/util/im2col.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/im2col.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/im2col.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp > CMakeFiles/caffe.dir/util/im2col.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/im2col.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp -o CMakeFiles/caffe.dir/util/im2col.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/util/upgrade_proto.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp > 
CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/util/db_leveldb.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_leveldb.cpp.i" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp > CMakeFiles/caffe.dir/util/db_leveldb.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_leveldb.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/util/ocl_util.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_util.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i: cmake_force - @$(CMAKE_COMMAND) 
-E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_util.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp > CMakeFiles/caffe.dir/util/ocl_util.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_util.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp -o CMakeFiles/caffe.dir/util/ocl_util.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/util/insert_splits.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/insert_splits.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/insert_splits.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp > CMakeFiles/caffe.dir/util/insert_splits.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/insert_splits.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp -o CMakeFiles/caffe.dir/util/insert_splits.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/util/db_lmdb.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_lmdb.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp > CMakeFiles/caffe.dir/util/db_lmdb.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_lmdb.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/util/math_functions.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 
$(CMAKE_PROGRESS_51) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/math_functions.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp > CMakeFiles/caffe.dir/util/math_functions.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/math_functions.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp -o CMakeFiles/caffe.dir/util/math_functions.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: 
src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/util/io.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/io.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp > CMakeFiles/caffe.dir/util/io.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/io.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp -o CMakeFiles/caffe.dir/util/io.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: 
src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/util/cudnn.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/cudnn.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/cudnn.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp > CMakeFiles/caffe.dir/util/cudnn.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/cudnn.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp -o CMakeFiles/caffe.dir/util/cudnn.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o - 
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/util/db.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp - -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp > CMakeFiles/caffe.dir/util/db.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp -o CMakeFiles/caffe.dir/util/db.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o - 
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/util/benchmark.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp - -src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/benchmark.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp > CMakeFiles/caffe.dir/util/benchmark.cpp.i - -src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/benchmark.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp -o CMakeFiles/caffe.dir/util/benchmark.cpp.s - -src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides - 
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o - -src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/device.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/device.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/device.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp - -src/caffe/CMakeFiles/caffe.dir/device.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/device.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp > CMakeFiles/caffe.dir/device.cpp.i - -src/caffe/CMakeFiles/caffe.dir/device.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/device.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp -o CMakeFiles/caffe.dir/device.cpp.s - -src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides - 
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/device.cpp.o - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/internal_thread.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/internal_thread.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp > CMakeFiles/caffe.dir/internal_thread.cpp.i - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/internal_thread.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp -o CMakeFiles/caffe.dir/internal_thread.cpp.s - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o - -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/data_transformer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp - -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/data_transformer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp > CMakeFiles/caffe.dir/data_transformer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/data_transformer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp -o CMakeFiles/caffe.dir/data_transformer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires - 
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/net.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/net.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp - -src/caffe/CMakeFiles/caffe.dir/net.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/net.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp > CMakeFiles/caffe.dir/net.cpp.i - -src/caffe/CMakeFiles/caffe.dir/net.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/net.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp -o CMakeFiles/caffe.dir/net.cpp.s - -src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires - 
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/net.cpp.o - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/solver.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_60) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/solver.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/solver.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp > CMakeFiles/caffe.dir/solver.cpp.i - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/solver.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp -o CMakeFiles/caffe.dir/solver.cpp.s - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/layer_factory.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_61) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp - -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layer_factory.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp > CMakeFiles/caffe.dir/layer_factory.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layer_factory.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp -o CMakeFiles/caffe.dir/layer_factory.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires - 
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/syncedmem.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_62) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/syncedmem.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp > CMakeFiles/caffe.dir/syncedmem.cpp.i - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/syncedmem.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp -o CMakeFiles/caffe.dir/syncedmem.cpp.s - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires: -.PHONY : 
src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/layers/deconv_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_63) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp > CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/layers/infogain_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_64) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp > CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i - 
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/layers/log_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_65) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/log_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source 
to CMakeFiles/caffe.dir/layers/log_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp > CMakeFiles/caffe.dir/layers/log_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/log_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp -o CMakeFiles/caffe.dir/layers/log_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/layers/base_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_66) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp > CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/layers/euclidean_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_67) - 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp > CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/layers/image_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_68) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp > CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/layers/sigmoid_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_69) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp -o 
CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/layers/cudnn_softmax_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_70) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E 
cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/layers/cudnn_tanh_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_71) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to 
CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/layers/spp_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_72) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/spp_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp > CMakeFiles/caffe.dir/layers/spp_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/spp_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/layers/hdf5_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_73) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make 
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/layers/exp_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_74) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/exp_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp > CMakeFiles/caffe.dir/layers/exp_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/exp_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides - 
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/layers/power_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_75) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/power_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp > CMakeFiles/caffe.dir/layers/power_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/power_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp -o CMakeFiles/caffe.dir/layers/power_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires - 
$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/layers/relu_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_76) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/relu_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp > CMakeFiles/caffe.dir/layers/relu_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/relu_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires: -.PHONY : 
src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/layers/split_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_77) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/split_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp > CMakeFiles/caffe.dir/layers/split_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/split_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp -o CMakeFiles/caffe.dir/layers/split_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/layers/window_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_78) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp > CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i - 
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/layers/dropout_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_79) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX 
source to CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp > CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/layers/cudnn_sigmoid_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_80) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/layers/silence_layer.cpp - $(CMAKE_COMMAND) 
-E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_81) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/silence_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp > CMakeFiles/caffe.dir/layers/silence_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/silence_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/layers/cudnn_pooling_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_82) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/layers/lrn_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_83) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp > CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp -o 
CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/layers/memory_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_84) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp > CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX 
source to assembly CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/layers/mvn_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_85) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp > CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/layers/cudnn_relu_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_86) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/layers/slice_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_87) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/slice_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp > CMakeFiles/caffe.dir/layers/slice_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/slice_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/layers/pooling_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 
$(CMAKE_PROGRESS_88) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp > CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o - 
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/layers/hdf5_output_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_89) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires - $(MAKE) -f 
src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/layers/inner_product_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_90) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp > CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp -o 
CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/layers/threshold_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_91) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp > CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s: cmake_force - 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/layers/reduction_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_92) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp > CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/layers/tanh_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_93) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp > CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/layers/prelu_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_94) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp > CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: 
src/caffe/layers/accuracy_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_95) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp > CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides - 
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/layers/neuron_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_96) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp > CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/layers/absval_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_97) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/absval_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp > CMakeFiles/caffe.dir/layers/absval_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/absval_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp -o 
CMakeFiles/caffe.dir/layers/absval_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/layers/loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_98) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp > CMakeFiles/caffe.dir/layers/loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/caffe.dir/layers/loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/layers/softmax_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_99) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp > 
CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/layers/cudnn_conv_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_100) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) 
--green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_101) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" - 
cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides - 
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/layers/concat_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_102) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/concat_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp > CMakeFiles/caffe.dir/layers/concat_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/concat_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides: 
src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/layers/hinge_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_103) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp > CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/layers/bnll_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_104) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp > CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s: cmake_force - 
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/layers/flatten_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_105) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp > CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/layers/argmax_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_106) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i: 
cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp > CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/layers/filter_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_107) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/filter_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp > CMakeFiles/caffe.dir/layers/filter_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/filter_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/layers/dummy_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_108) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp > CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build: 
src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/layers/conv_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_109) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/conv_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp > CMakeFiles/caffe.dir/layers/conv_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/conv_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/layers/base_conv_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_110) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp > CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires: 
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/layers/data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_111) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp > CMakeFiles/caffe.dir/layers/data_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp -o CMakeFiles/caffe.dir/layers/data_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/layers/softmax_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_112) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i - 
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/layers/eltwise_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_113) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Preprocessing CXX source to CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp > CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/layers/im2col_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_114) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp > CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/layers/multinomial_logistic_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_115) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp > CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make 
src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/layers/contrastive_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_116) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp > CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/layers/reshape_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_117) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp > 
CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires: -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires - $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build -.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides - -src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o - -# Object files for target caffe -caffe_OBJECTS = \ -"CMakeFiles/caffe.dir/common.cpp.o" \ -"CMakeFiles/caffe.dir/blob.cpp.o" \ -"CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" \ -"CMakeFiles/caffe.dir/util/im2col.cpp.o" \ -"CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" \ -"CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" \ -"CMakeFiles/caffe.dir/util/ocl_util.cpp.o" \ -"CMakeFiles/caffe.dir/util/insert_splits.cpp.o" \ -"CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" \ -"CMakeFiles/caffe.dir/util/math_functions.cpp.o" \ -"CMakeFiles/caffe.dir/util/io.cpp.o" \ -"CMakeFiles/caffe.dir/util/cudnn.cpp.o" \ -"CMakeFiles/caffe.dir/util/db.cpp.o" \ -"CMakeFiles/caffe.dir/util/benchmark.cpp.o" \ -"CMakeFiles/caffe.dir/device.cpp.o" \ -"CMakeFiles/caffe.dir/internal_thread.cpp.o" \ -"CMakeFiles/caffe.dir/data_transformer.cpp.o" \ -"CMakeFiles/caffe.dir/net.cpp.o" \ 
-"CMakeFiles/caffe.dir/solver.cpp.o" \ -"CMakeFiles/caffe.dir/layer_factory.cpp.o" \ -"CMakeFiles/caffe.dir/syncedmem.cpp.o" \ -"CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/log_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/power_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/split_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" \ 
-"CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/data_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" \ -"CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" - -# External object files for target caffe -caffe_EXTERNAL_OBJECTS = \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" \ 
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" \ 
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" \ 
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" - -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/common.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o -lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/device.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/net.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o 
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o -lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o 
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o 
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -lib/libcaffe.so: 
src/caffe/CMakeFiles/caffe.dir/build.make -lib/libcaffe.so: lib/libproto.a -lib/libcaffe.so: lib/libproto.a -lib/libcaffe.so: /usr/local/lib/libboost_system.so -lib/libcaffe.so: /usr/local/lib/libboost_thread.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libpthread.so -lib/libcaffe.so: /usr/local/lib/libglog.so -lib/libcaffe.so: /usr/local/lib/libgflags.a -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so -lib/libcaffe.so: /usr/local/lib/liblmdb.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so -lib/libcaffe.so: /usr/lib/libsnappy.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so -lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10 -lib/libcaffe.so: /usr/local/lib/libopencv_highgui.so.2.4.10 -lib/libcaffe.so: /usr/local/lib/libopencv_imgproc.so.2.4.10 -lib/libcaffe.so: /usr/lib/liblapack_atlas.so -lib/libcaffe.so: /usr/lib/libcblas.so -lib/libcaffe.so: /usr/lib/libatlas.so -lib/libcaffe.so: /usr/local/lib/libglog.so -lib/libcaffe.so: /usr/local/lib/libgflags.a -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so -lib/libcaffe.so: /usr/local/lib/liblmdb.so -lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so -lib/libcaffe.so: /usr/lib/libsnappy.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so -lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so -lib/libcaffe.so: /usr/lib/liblapack_atlas.so -lib/libcaffe.so: /usr/lib/libcblas.so -lib/libcaffe.so: /usr/lib/libatlas.so -lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10 -lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/link.txt - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) 
--red --bold "Linking CXX shared library ../../lib/libcaffe.so" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/caffe.dir/link.txt --verbose=$(VERBOSE) - -# Rule to build all files generated by this target. -src/caffe/CMakeFiles/caffe.dir/build: lib/libcaffe.so -.PHONY : src/caffe/CMakeFiles/caffe.dir/build - -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires 
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: 
src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires -src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires -.PHONY : src/caffe/CMakeFiles/caffe.dir/requires - -src/caffe/CMakeFiles/caffe.dir/clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/caffe.dir/cmake_clean.cmake -.PHONY : src/caffe/CMakeFiles/caffe.dir/clean - -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o 
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: 
src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake --color=$(COLOR) -.PHONY : src/caffe/CMakeFiles/caffe.dir/depend - diff --git a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake deleted file mode 100644 index 344db002..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake +++ /dev/null @@ -1,126 +0,0 @@ -FILE(REMOVE_RECURSE - "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" - "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" - 
"CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" - 
"CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" - "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o" - "CMakeFiles/caffe.dir/common.cpp.o" - "CMakeFiles/caffe.dir/blob.cpp.o" - "CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" - "CMakeFiles/caffe.dir/util/im2col.cpp.o" - "CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" - "CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" - "CMakeFiles/caffe.dir/util/ocl_util.cpp.o" - "CMakeFiles/caffe.dir/util/insert_splits.cpp.o" - "CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" - "CMakeFiles/caffe.dir/util/math_functions.cpp.o" - "CMakeFiles/caffe.dir/util/io.cpp.o" - "CMakeFiles/caffe.dir/util/cudnn.cpp.o" - "CMakeFiles/caffe.dir/util/db.cpp.o" - "CMakeFiles/caffe.dir/util/benchmark.cpp.o" - "CMakeFiles/caffe.dir/device.cpp.o" - "CMakeFiles/caffe.dir/internal_thread.cpp.o" - 
"CMakeFiles/caffe.dir/data_transformer.cpp.o" - "CMakeFiles/caffe.dir/net.cpp.o" - "CMakeFiles/caffe.dir/solver.cpp.o" - "CMakeFiles/caffe.dir/layer_factory.cpp.o" - "CMakeFiles/caffe.dir/syncedmem.cpp.o" - "CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/log_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/power_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/split_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" - 
"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/data_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" - "CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o" - "../../lib/libcaffe.pdb" - "../../lib/libcaffe.so" -) - -# Per-language clean rules from dependency scanning. -FOREACH(lang CXX) - INCLUDE(CMakeFiles/caffe.dir/cmake_clean_${lang}.cmake OPTIONAL) -ENDFOREACH(lang) diff --git a/src/caffe/CMakeFiles/caffe.dir/depend.make b/src/caffe/CMakeFiles/caffe.dir/depend.make deleted file mode 100644 index 0b20d16b..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/depend.make +++ /dev/null @@ -1,2 +0,0 @@ -# Empty dependencies file for caffe. -# This may be replaced when dependencies are built. diff --git a/src/caffe/CMakeFiles/caffe.dir/flags.make b/src/caffe/CMakeFiles/caffe.dir/flags.make deleted file mode 100644 index 494d36e8..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/flags.make +++ /dev/null @@ -1,8 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! 
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# compile CXX with /usr/bin/c++ -CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -fPIC -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe - -CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE -Dcaffe_EXPORTS - diff --git a/src/caffe/CMakeFiles/caffe.dir/link.txt b/src/caffe/CMakeFiles/caffe.dir/link.txt deleted file mode 100644 index 603d461f..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/link.txt +++ /dev/null @@ -1 +0,0 @@ -/usr/bin/c++ -fPIC -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -shared -Wl,-soname,libcaffe.so -o ../../lib/libcaffe.so CMakeFiles/caffe.dir/common.cpp.o CMakeFiles/caffe.dir/blob.cpp.o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o CMakeFiles/caffe.dir/util/im2col.cpp.o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o CMakeFiles/caffe.dir/util/ocl_util.cpp.o CMakeFiles/caffe.dir/util/insert_splits.cpp.o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o CMakeFiles/caffe.dir/util/math_functions.cpp.o CMakeFiles/caffe.dir/util/io.cpp.o CMakeFiles/caffe.dir/util/cudnn.cpp.o CMakeFiles/caffe.dir/util/db.cpp.o CMakeFiles/caffe.dir/util/benchmark.cpp.o CMakeFiles/caffe.dir/device.cpp.o CMakeFiles/caffe.dir/internal_thread.cpp.o CMakeFiles/caffe.dir/data_transformer.cpp.o CMakeFiles/caffe.dir/net.cpp.o CMakeFiles/caffe.dir/solver.cpp.o CMakeFiles/caffe.dir/layer_factory.cpp.o CMakeFiles/caffe.dir/syncedmem.cpp.o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/log_layer.cpp.o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o 
CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o CMakeFiles/caffe.dir/layers/power_layer.cpp.o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o CMakeFiles/caffe.dir/layers/split_layer.cpp.o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o 
CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/data_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o 
CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o 
CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -L/usr/local/cuda/lib64 -L/usr/local/lib ../../lib/libproto.a ../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_core.so.2.4.10 /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 -llapack_atlas -lcblas -latlas /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so -llapack_atlas -lcblas -latlas /usr/local/lib/libopencv_core.so.2.4.10 -Wl,-rpath,/usr/local/cuda/lib64:/usr/local/lib::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/src/caffe/CMakeFiles/caffe.dir/progress.make b/src/caffe/CMakeFiles/caffe.dir/progress.make deleted file mode 100644 index d53ba6a8..00000000 --- a/src/caffe/CMakeFiles/caffe.dir/progress.make +++ /dev/null @@ -1,118 +0,0 @@ -CMAKE_PROGRESS_1 = -CMAKE_PROGRESS_2 = 1 -CMAKE_PROGRESS_3 = -CMAKE_PROGRESS_4 = 2 -CMAKE_PROGRESS_5 = -CMAKE_PROGRESS_6 = 3 -CMAKE_PROGRESS_7 = -CMAKE_PROGRESS_8 = 4 -CMAKE_PROGRESS_9 = -CMAKE_PROGRESS_10 = 5 -CMAKE_PROGRESS_11 = -CMAKE_PROGRESS_12 = 6 -CMAKE_PROGRESS_13 = -CMAKE_PROGRESS_14 = 7 -CMAKE_PROGRESS_15 = -CMAKE_PROGRESS_16 = 8 -CMAKE_PROGRESS_17 = -CMAKE_PROGRESS_18 = 9 -CMAKE_PROGRESS_19 = -CMAKE_PROGRESS_20 = 10 -CMAKE_PROGRESS_21 = -CMAKE_PROGRESS_22 = 11 -CMAKE_PROGRESS_23 = -CMAKE_PROGRESS_24 = 12 -CMAKE_PROGRESS_25 = -CMAKE_PROGRESS_26 = 13 -CMAKE_PROGRESS_27 = -CMAKE_PROGRESS_28 = 14 -CMAKE_PROGRESS_29 
= -CMAKE_PROGRESS_30 = 15 -CMAKE_PROGRESS_31 = -CMAKE_PROGRESS_32 = 16 -CMAKE_PROGRESS_33 = -CMAKE_PROGRESS_34 = 17 -CMAKE_PROGRESS_35 = -CMAKE_PROGRESS_36 = 18 -CMAKE_PROGRESS_37 = -CMAKE_PROGRESS_38 = 19 -CMAKE_PROGRESS_39 = -CMAKE_PROGRESS_40 = 20 -CMAKE_PROGRESS_41 = -CMAKE_PROGRESS_42 = 21 -CMAKE_PROGRESS_43 = -CMAKE_PROGRESS_44 = 22 -CMAKE_PROGRESS_45 = -CMAKE_PROGRESS_46 = 23 -CMAKE_PROGRESS_47 = -CMAKE_PROGRESS_48 = 24 -CMAKE_PROGRESS_49 = -CMAKE_PROGRESS_50 = 25 -CMAKE_PROGRESS_51 = -CMAKE_PROGRESS_52 = 26 -CMAKE_PROGRESS_53 = -CMAKE_PROGRESS_54 = 27 -CMAKE_PROGRESS_55 = -CMAKE_PROGRESS_56 = 28 -CMAKE_PROGRESS_57 = -CMAKE_PROGRESS_58 = 29 -CMAKE_PROGRESS_59 = -CMAKE_PROGRESS_60 = 30 -CMAKE_PROGRESS_61 = -CMAKE_PROGRESS_62 = 31 -CMAKE_PROGRESS_63 = -CMAKE_PROGRESS_64 = 32 -CMAKE_PROGRESS_65 = -CMAKE_PROGRESS_66 = 33 -CMAKE_PROGRESS_67 = 34 -CMAKE_PROGRESS_68 = -CMAKE_PROGRESS_69 = 35 -CMAKE_PROGRESS_70 = -CMAKE_PROGRESS_71 = 36 -CMAKE_PROGRESS_72 = -CMAKE_PROGRESS_73 = 37 -CMAKE_PROGRESS_74 = -CMAKE_PROGRESS_75 = 38 -CMAKE_PROGRESS_76 = -CMAKE_PROGRESS_77 = 39 -CMAKE_PROGRESS_78 = -CMAKE_PROGRESS_79 = 40 -CMAKE_PROGRESS_80 = -CMAKE_PROGRESS_81 = 41 -CMAKE_PROGRESS_82 = -CMAKE_PROGRESS_83 = 42 -CMAKE_PROGRESS_84 = -CMAKE_PROGRESS_85 = 43 -CMAKE_PROGRESS_86 = -CMAKE_PROGRESS_87 = 44 -CMAKE_PROGRESS_88 = -CMAKE_PROGRESS_89 = 45 -CMAKE_PROGRESS_90 = -CMAKE_PROGRESS_91 = 46 -CMAKE_PROGRESS_92 = -CMAKE_PROGRESS_93 = 47 -CMAKE_PROGRESS_94 = -CMAKE_PROGRESS_95 = 48 -CMAKE_PROGRESS_96 = -CMAKE_PROGRESS_97 = 49 -CMAKE_PROGRESS_98 = -CMAKE_PROGRESS_99 = 50 -CMAKE_PROGRESS_100 = -CMAKE_PROGRESS_101 = 51 -CMAKE_PROGRESS_102 = -CMAKE_PROGRESS_103 = 52 -CMAKE_PROGRESS_104 = -CMAKE_PROGRESS_105 = 53 -CMAKE_PROGRESS_106 = -CMAKE_PROGRESS_107 = 54 -CMAKE_PROGRESS_108 = -CMAKE_PROGRESS_109 = 55 -CMAKE_PROGRESS_110 = -CMAKE_PROGRESS_111 = 56 -CMAKE_PROGRESS_112 = -CMAKE_PROGRESS_113 = 57 -CMAKE_PROGRESS_114 = -CMAKE_PROGRESS_115 = 58 -CMAKE_PROGRESS_116 = -CMAKE_PROGRESS_117 = 
59 - diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake deleted file mode 100644 index 2b3197e9..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. - -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
- - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/absval_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in 
-# order to force this file to be run again if it changes. -set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake deleted file mode 100644 index 5558d70f..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/base_data_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake deleted file mode 100644 index ae71cc72..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/bnll_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake deleted file mode 100644 index 48e8560a..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/concat_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake deleted file mode 100644 index c5f6dca9..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/contrastive_loss_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake deleted file mode 100644 index 311ad242..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/conv_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake deleted file mode 100644 index 06210cf1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_conv_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake deleted file mode 100644 index 8f7960d4..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_pooling_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake deleted file mode 100644 index 308889ee..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_relu_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake deleted file mode 100644 index d65ebd00..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake deleted file mode 100644 index 806067ce..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_softmax_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake deleted file mode 100644 index 7ace65eb..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_tanh_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake deleted file mode 100644 index bc67ea5b..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/deconv_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake deleted file mode 100644 index 5ff06e9f..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/dropout_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake deleted file mode 100644 index 44e91898..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/eltwise_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake deleted file mode 100644 index 98ee3de7..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/euclidean_loss_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake deleted file mode 100644 index 2402999e..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/exp_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake deleted file mode 100644 index 83a032df..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/filter_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake deleted file mode 100644 index a88ed54d..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_data_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake deleted file mode 100644 index 252b9dfd..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_output_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake deleted file mode 100644 index 6bda58ec..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/im2col_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake deleted file mode 100644 index eac6680c..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/inner_product_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake deleted file mode 100644 index d18371a0..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/log_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake deleted file mode 100644 index c3c715f8..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/lrn_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake deleted file mode 100644 index 663f4478..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/mvn_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake deleted file mode 100644 index 866d0f93..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/pooling_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake deleted file mode 100644 index c6c30190..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/power_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake deleted file mode 100644 index c64cff0e..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/prelu_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake deleted file mode 100644 index b926deab..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/reduction_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake deleted file mode 100644 index 27fda108..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/relu_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake deleted file mode 100644 index 63d7ac68..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend deleted file mode 100644 index a7e2268a..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend +++ /dev/null @@ -1,470 +0,0 @@ -# Generated by: make2cmake.cmake -SET(CUDA_NVCC_DEPEND - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu" - "/opt/clBLAS-private-april8/include/clBLAS-complex.h" - "/opt/clBLAS-private-april8/include/clBLAS.h" - "/usr/include/H5ACpublic.h" - "/usr/include/H5Apublic.h" - "/usr/include/H5Cpublic.h" - "/usr/include/H5Dpublic.h" - "/usr/include/H5Epubgen.h" - "/usr/include/H5Epublic.h" - "/usr/include/H5FDcore.h" - "/usr/include/H5FDdirect.h" - "/usr/include/H5FDfamily.h" - "/usr/include/H5FDlog.h" - "/usr/include/H5FDmpi.h" - "/usr/include/H5FDmpio.h" - "/usr/include/H5FDmpiposix.h" - "/usr/include/H5FDmulti.h" - "/usr/include/H5FDpublic.h" - "/usr/include/H5FDsec2.h" - "/usr/include/H5FDstdio.h" - "/usr/include/H5Fpublic.h" - "/usr/include/H5Gpublic.h" - "/usr/include/H5Ipublic.h" - "/usr/include/H5Lpublic.h" - "/usr/include/H5MMpublic.h" - "/usr/include/H5Opublic.h" - "/usr/include/H5Ppublic.h" - "/usr/include/H5Rpublic.h" - "/usr/include/H5Spublic.h" - "/usr/include/H5Tpublic.h" - "/usr/include/H5Zpublic.h" - "/usr/include/H5api_adpt.h" - "/usr/include/H5pubconf.h" - "/usr/include/H5public.h" - "/usr/include/H5version.h" - "/usr/include/_G_config.h" - 
"/usr/include/alloca.h" - "/usr/include/asm-generic/errno-base.h" - "/usr/include/asm-generic/errno.h" - "/usr/include/assert.h" - "/usr/include/atlas/cblas.h" - "/usr/include/c++/4.8/algorithm" - "/usr/include/c++/4.8/backward/auto_ptr.h" - "/usr/include/c++/4.8/backward/binders.h" - "/usr/include/c++/4.8/bits/algorithmfwd.h" - "/usr/include/c++/4.8/bits/allocator.h" - "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" - "/usr/include/c++/4.8/bits/basic_ios.h" - "/usr/include/c++/4.8/bits/basic_ios.tcc" - "/usr/include/c++/4.8/bits/basic_string.h" - "/usr/include/c++/4.8/bits/basic_string.tcc" - "/usr/include/c++/4.8/bits/char_traits.h" - "/usr/include/c++/4.8/bits/codecvt.h" - "/usr/include/c++/4.8/bits/concept_check.h" - "/usr/include/c++/4.8/bits/cpp_type_traits.h" - "/usr/include/c++/4.8/bits/cxxabi_forced.h" - "/usr/include/c++/4.8/bits/exception_defines.h" - "/usr/include/c++/4.8/bits/fstream.tcc" - "/usr/include/c++/4.8/bits/functexcept.h" - "/usr/include/c++/4.8/bits/ios_base.h" - "/usr/include/c++/4.8/bits/istream.tcc" - "/usr/include/c++/4.8/bits/locale_classes.h" - "/usr/include/c++/4.8/bits/locale_classes.tcc" - "/usr/include/c++/4.8/bits/locale_facets.h" - "/usr/include/c++/4.8/bits/locale_facets.tcc" - "/usr/include/c++/4.8/bits/localefwd.h" - "/usr/include/c++/4.8/bits/memoryfwd.h" - "/usr/include/c++/4.8/bits/move.h" - "/usr/include/c++/4.8/bits/ostream.tcc" - "/usr/include/c++/4.8/bits/ostream_insert.h" - "/usr/include/c++/4.8/bits/postypes.h" - "/usr/include/c++/4.8/bits/range_access.h" - "/usr/include/c++/4.8/bits/sstream.tcc" - "/usr/include/c++/4.8/bits/stl_algo.h" - "/usr/include/c++/4.8/bits/stl_algobase.h" - "/usr/include/c++/4.8/bits/stl_bvector.h" - "/usr/include/c++/4.8/bits/stl_construct.h" - "/usr/include/c++/4.8/bits/stl_function.h" - "/usr/include/c++/4.8/bits/stl_heap.h" - "/usr/include/c++/4.8/bits/stl_iterator.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" - 
"/usr/include/c++/4.8/bits/stl_map.h" - "/usr/include/c++/4.8/bits/stl_multimap.h" - "/usr/include/c++/4.8/bits/stl_multiset.h" - "/usr/include/c++/4.8/bits/stl_pair.h" - "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" - "/usr/include/c++/4.8/bits/stl_relops.h" - "/usr/include/c++/4.8/bits/stl_set.h" - "/usr/include/c++/4.8/bits/stl_tempbuf.h" - "/usr/include/c++/4.8/bits/stl_tree.h" - "/usr/include/c++/4.8/bits/stl_uninitialized.h" - "/usr/include/c++/4.8/bits/stl_vector.h" - "/usr/include/c++/4.8/bits/stream_iterator.h" - "/usr/include/c++/4.8/bits/streambuf.tcc" - "/usr/include/c++/4.8/bits/streambuf_iterator.h" - "/usr/include/c++/4.8/bits/stringfwd.h" - "/usr/include/c++/4.8/bits/vector.tcc" - "/usr/include/c++/4.8/cctype" - "/usr/include/c++/4.8/cfloat" - "/usr/include/c++/4.8/climits" - "/usr/include/c++/4.8/clocale" - "/usr/include/c++/4.8/cmath" - "/usr/include/c++/4.8/cstddef" - "/usr/include/c++/4.8/cstdio" - "/usr/include/c++/4.8/cstdlib" - "/usr/include/c++/4.8/cwchar" - "/usr/include/c++/4.8/cwctype" - "/usr/include/c++/4.8/cxxabi.h" - "/usr/include/c++/4.8/debug/debug.h" - "/usr/include/c++/4.8/exception" - "/usr/include/c++/4.8/ext/alloc_traits.h" - "/usr/include/c++/4.8/ext/atomicity.h" - "/usr/include/c++/4.8/ext/new_allocator.h" - "/usr/include/c++/4.8/ext/numeric_traits.h" - "/usr/include/c++/4.8/ext/type_traits.h" - "/usr/include/c++/4.8/fstream" - "/usr/include/c++/4.8/functional" - "/usr/include/c++/4.8/ios" - "/usr/include/c++/4.8/iosfwd" - "/usr/include/c++/4.8/iostream" - "/usr/include/c++/4.8/istream" - "/usr/include/c++/4.8/iterator" - "/usr/include/c++/4.8/map" - "/usr/include/c++/4.8/memory" - "/usr/include/c++/4.8/new" - "/usr/include/c++/4.8/ostream" - "/usr/include/c++/4.8/set" - "/usr/include/c++/4.8/sstream" - "/usr/include/c++/4.8/streambuf" - "/usr/include/c++/4.8/string" - "/usr/include/c++/4.8/typeinfo" - "/usr/include/c++/4.8/utility" - "/usr/include/c++/4.8/vector" - "/usr/include/ctype.h" - "/usr/include/endian.h" - 
"/usr/include/errno.h" - "/usr/include/features.h" - "/usr/include/getopt.h" - "/usr/include/google/protobuf/descriptor.h" - "/usr/include/google/protobuf/extension_set.h" - "/usr/include/google/protobuf/generated_enum_reflection.h" - "/usr/include/google/protobuf/generated_message_util.h" - "/usr/include/google/protobuf/message.h" - "/usr/include/google/protobuf/message_lite.h" - "/usr/include/google/protobuf/repeated_field.h" - "/usr/include/google/protobuf/stubs/common.h" - "/usr/include/google/protobuf/stubs/template_util.h" - "/usr/include/google/protobuf/stubs/type_traits.h" - "/usr/include/google/protobuf/unknown_field_set.h" - "/usr/include/hdf5.h" - "/usr/include/inttypes.h" - "/usr/include/libio.h" - "/usr/include/limits.h" - "/usr/include/linux/errno.h" - "/usr/include/linux/limits.h" - "/usr/include/locale.h" - "/usr/include/math.h" - "/usr/include/pthread.h" - "/usr/include/sched.h" - "/usr/include/stdc-predef.h" - "/usr/include/stdint.h" - "/usr/include/stdio.h" - "/usr/include/stdlib.h" - "/usr/include/string.h" - "/usr/include/time.h" - "/usr/include/unistd.h" - "/usr/include/wchar.h" - "/usr/include/wctype.h" - "/usr/include/x86_64-linux-gnu/asm/errno.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap.h" - "/usr/include/x86_64-linux-gnu/bits/confname.h" - "/usr/include/x86_64-linux-gnu/bits/endian.h" - "/usr/include/x86_64-linux-gnu/bits/environments.h" - "/usr/include/x86_64-linux-gnu/bits/errno.h" - "/usr/include/x86_64-linux-gnu/bits/huge_val.h" - "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" - "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" - "/usr/include/x86_64-linux-gnu/bits/inf.h" - "/usr/include/x86_64-linux-gnu/bits/local_lim.h" - "/usr/include/x86_64-linux-gnu/bits/locale.h" - "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" - "/usr/include/x86_64-linux-gnu/bits/mathdef.h" - "/usr/include/x86_64-linux-gnu/bits/mathinline.h" - "/usr/include/x86_64-linux-gnu/bits/nan.h" - 
"/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" - "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" - "/usr/include/x86_64-linux-gnu/bits/sched.h" - "/usr/include/x86_64-linux-gnu/bits/select.h" - "/usr/include/x86_64-linux-gnu/bits/select2.h" - "/usr/include/x86_64-linux-gnu/bits/setjmp.h" - "/usr/include/x86_64-linux-gnu/bits/sigset.h" - "/usr/include/x86_64-linux-gnu/bits/stdio.h" - "/usr/include/x86_64-linux-gnu/bits/stdio2.h" - "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib.h" - "/usr/include/x86_64-linux-gnu/bits/string3.h" - "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" - "/usr/include/x86_64-linux-gnu/bits/time.h" - "/usr/include/x86_64-linux-gnu/bits/timex.h" - "/usr/include/x86_64-linux-gnu/bits/types.h" - "/usr/include/x86_64-linux-gnu/bits/typesizes.h" - "/usr/include/x86_64-linux-gnu/bits/unistd.h" - "/usr/include/x86_64-linux-gnu/bits/waitflags.h" - "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" - "/usr/include/x86_64-linux-gnu/bits/wchar.h" - "/usr/include/x86_64-linux-gnu/bits/wchar2.h" - "/usr/include/x86_64-linux-gnu/bits/wordsize.h" - "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" - 
"/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs.h" - "/usr/include/x86_64-linux-gnu/sys/cdefs.h" - "/usr/include/x86_64-linux-gnu/sys/select.h" - "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" - "/usr/include/x86_64-linux-gnu/sys/types.h" - "/usr/include/xlocale.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/float.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" - "/usr/local/cuda-6.5/include/CL/cl.h" - "/usr/local/cuda-6.5/include/CL/cl_ext.h" - "/usr/local/cuda-6.5/include/CL/cl_platform.h" - "/usr/local/cuda-6.5/include/builtin_types.h" - "/usr/local/cuda-6.5/include/channel_descriptor.h" - "/usr/local/cuda-6.5/include/common_functions.h" - "/usr/local/cuda-6.5/include/cuComplex.h" - "/usr/local/cuda-6.5/include/cublas_api.h" - "/usr/local/cuda-6.5/include/cublas_v2.h" - "/usr/local/cuda-6.5/include/cuda.h" - "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_runtime.h" - "/usr/local/cuda-6.5/include/cuda_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_surface_types.h" - "/usr/local/cuda-6.5/include/cuda_texture_types.h" - "/usr/local/cuda-6.5/include/curand.h" - "/usr/local/cuda-6.5/include/device_functions.h" - "/usr/local/cuda-6.5/include/device_launch_parameters.h" - "/usr/local/cuda-6.5/include/device_types.h" - 
"/usr/local/cuda-6.5/include/driver_functions.h" - "/usr/local/cuda-6.5/include/driver_types.h" - "/usr/local/cuda-6.5/include/host_config.h" - "/usr/local/cuda-6.5/include/host_defines.h" - "/usr/local/cuda-6.5/include/math_functions.h" - "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" - "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_13_double_functions.h" - "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" - "/usr/local/cuda-6.5/include/surface_functions.h" - "/usr/local/cuda-6.5/include/surface_indirect_functions.h" - "/usr/local/cuda-6.5/include/surface_types.h" - "/usr/local/cuda-6.5/include/texture_fetch_functions.h" - "/usr/local/cuda-6.5/include/texture_indirect_functions.h" - "/usr/local/cuda-6.5/include/texture_types.h" - "/usr/local/cuda-6.5/include/vector_functions.h" - "/usr/local/cuda-6.5/include/vector_types.h" - "/usr/local/include/boost/assert.hpp" - "/usr/local/include/boost/checked_delete.hpp" - "/usr/local/include/boost/config.hpp" - "/usr/local/include/boost/config/compiler/gcc.hpp" - "/usr/local/include/boost/config/compiler/nvcc.hpp" - "/usr/local/include/boost/config/no_tr1/memory.hpp" - "/usr/local/include/boost/config/no_tr1/utility.hpp" - "/usr/local/include/boost/config/platform/linux.hpp" - "/usr/local/include/boost/config/posix_features.hpp" - "/usr/local/include/boost/config/select_compiler_config.hpp" - "/usr/local/include/boost/config/select_platform_config.hpp" - "/usr/local/include/boost/config/select_stdlib_config.hpp" - "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" - 
"/usr/local/include/boost/config/suffix.hpp" - "/usr/local/include/boost/config/user.hpp" - "/usr/local/include/boost/core/checked_delete.hpp" - "/usr/local/include/boost/core/demangle.hpp" - "/usr/local/include/boost/core/typeinfo.hpp" - "/usr/local/include/boost/current_function.hpp" - "/usr/local/include/boost/detail/sp_typeinfo.hpp" - "/usr/local/include/boost/detail/workaround.hpp" - "/usr/local/include/boost/exception/exception.hpp" - "/usr/local/include/boost/predef.h" - "/usr/local/include/boost/predef/architecture.h" - "/usr/local/include/boost/predef/architecture/alpha.h" - "/usr/local/include/boost/predef/architecture/arm.h" - "/usr/local/include/boost/predef/architecture/blackfin.h" - "/usr/local/include/boost/predef/architecture/convex.h" - "/usr/local/include/boost/predef/architecture/ia64.h" - "/usr/local/include/boost/predef/architecture/m68k.h" - "/usr/local/include/boost/predef/architecture/mips.h" - "/usr/local/include/boost/predef/architecture/parisc.h" - "/usr/local/include/boost/predef/architecture/ppc.h" - "/usr/local/include/boost/predef/architecture/pyramid.h" - "/usr/local/include/boost/predef/architecture/rs6k.h" - "/usr/local/include/boost/predef/architecture/sparc.h" - "/usr/local/include/boost/predef/architecture/superh.h" - "/usr/local/include/boost/predef/architecture/sys370.h" - "/usr/local/include/boost/predef/architecture/sys390.h" - "/usr/local/include/boost/predef/architecture/x86.h" - "/usr/local/include/boost/predef/architecture/x86/32.h" - "/usr/local/include/boost/predef/architecture/x86/64.h" - "/usr/local/include/boost/predef/architecture/z.h" - "/usr/local/include/boost/predef/compiler.h" - "/usr/local/include/boost/predef/compiler/borland.h" - "/usr/local/include/boost/predef/compiler/clang.h" - "/usr/local/include/boost/predef/compiler/comeau.h" - "/usr/local/include/boost/predef/compiler/compaq.h" - "/usr/local/include/boost/predef/compiler/diab.h" - "/usr/local/include/boost/predef/compiler/digitalmars.h" - 
"/usr/local/include/boost/predef/compiler/dignus.h" - "/usr/local/include/boost/predef/compiler/edg.h" - "/usr/local/include/boost/predef/compiler/ekopath.h" - "/usr/local/include/boost/predef/compiler/gcc.h" - "/usr/local/include/boost/predef/compiler/gcc_xml.h" - "/usr/local/include/boost/predef/compiler/greenhills.h" - "/usr/local/include/boost/predef/compiler/hp_acc.h" - "/usr/local/include/boost/predef/compiler/iar.h" - "/usr/local/include/boost/predef/compiler/ibm.h" - "/usr/local/include/boost/predef/compiler/intel.h" - "/usr/local/include/boost/predef/compiler/kai.h" - "/usr/local/include/boost/predef/compiler/llvm.h" - "/usr/local/include/boost/predef/compiler/metaware.h" - "/usr/local/include/boost/predef/compiler/metrowerks.h" - "/usr/local/include/boost/predef/compiler/microtec.h" - "/usr/local/include/boost/predef/compiler/mpw.h" - "/usr/local/include/boost/predef/compiler/palm.h" - "/usr/local/include/boost/predef/compiler/pgi.h" - "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" - "/usr/local/include/boost/predef/compiler/sunpro.h" - "/usr/local/include/boost/predef/compiler/tendra.h" - "/usr/local/include/boost/predef/compiler/visualc.h" - "/usr/local/include/boost/predef/compiler/watcom.h" - "/usr/local/include/boost/predef/detail/_cassert.h" - "/usr/local/include/boost/predef/detail/_exception.h" - "/usr/local/include/boost/predef/detail/comp_detected.h" - "/usr/local/include/boost/predef/detail/os_detected.h" - "/usr/local/include/boost/predef/detail/test.h" - "/usr/local/include/boost/predef/language.h" - "/usr/local/include/boost/predef/language/objc.h" - "/usr/local/include/boost/predef/language/stdc.h" - "/usr/local/include/boost/predef/language/stdcpp.h" - "/usr/local/include/boost/predef/library.h" - "/usr/local/include/boost/predef/library/c.h" - "/usr/local/include/boost/predef/library/c/_prefix.h" - "/usr/local/include/boost/predef/library/c/gnu.h" - "/usr/local/include/boost/predef/library/c/uc.h" - 
"/usr/local/include/boost/predef/library/c/vms.h" - "/usr/local/include/boost/predef/library/c/zos.h" - "/usr/local/include/boost/predef/library/std.h" - "/usr/local/include/boost/predef/library/std/_prefix.h" - "/usr/local/include/boost/predef/library/std/cxx.h" - "/usr/local/include/boost/predef/library/std/dinkumware.h" - "/usr/local/include/boost/predef/library/std/libcomo.h" - "/usr/local/include/boost/predef/library/std/modena.h" - "/usr/local/include/boost/predef/library/std/msl.h" - "/usr/local/include/boost/predef/library/std/roguewave.h" - "/usr/local/include/boost/predef/library/std/sgi.h" - "/usr/local/include/boost/predef/library/std/stdcpp3.h" - "/usr/local/include/boost/predef/library/std/stlport.h" - "/usr/local/include/boost/predef/library/std/vacpp.h" - "/usr/local/include/boost/predef/make.h" - "/usr/local/include/boost/predef/os.h" - "/usr/local/include/boost/predef/os/aix.h" - "/usr/local/include/boost/predef/os/amigaos.h" - "/usr/local/include/boost/predef/os/android.h" - "/usr/local/include/boost/predef/os/beos.h" - "/usr/local/include/boost/predef/os/bsd.h" - "/usr/local/include/boost/predef/os/bsd/bsdi.h" - "/usr/local/include/boost/predef/os/bsd/dragonfly.h" - "/usr/local/include/boost/predef/os/bsd/free.h" - "/usr/local/include/boost/predef/os/bsd/net.h" - "/usr/local/include/boost/predef/os/bsd/open.h" - "/usr/local/include/boost/predef/os/cygwin.h" - "/usr/local/include/boost/predef/os/hpux.h" - "/usr/local/include/boost/predef/os/ios.h" - "/usr/local/include/boost/predef/os/irix.h" - "/usr/local/include/boost/predef/os/linux.h" - "/usr/local/include/boost/predef/os/macos.h" - "/usr/local/include/boost/predef/os/os400.h" - "/usr/local/include/boost/predef/os/qnxnto.h" - "/usr/local/include/boost/predef/os/solaris.h" - "/usr/local/include/boost/predef/os/unix.h" - "/usr/local/include/boost/predef/os/vms.h" - "/usr/local/include/boost/predef/os/windows.h" - "/usr/local/include/boost/predef/other.h" - 
"/usr/local/include/boost/predef/other/endian.h" - "/usr/local/include/boost/predef/platform.h" - "/usr/local/include/boost/predef/platform/mingw.h" - "/usr/local/include/boost/predef/platform/windows_desktop.h" - "/usr/local/include/boost/predef/platform/windows_phone.h" - "/usr/local/include/boost/predef/platform/windows_runtime.h" - "/usr/local/include/boost/predef/platform/windows_store.h" - "/usr/local/include/boost/predef/version_number.h" - "/usr/local/include/boost/scoped_ptr.hpp" - "/usr/local/include/boost/shared_ptr.hpp" - "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" - "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" - "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" - "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp" - "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" - "/usr/local/include/boost/throw_exception.hpp" - "/usr/local/include/gflags/gflags.h" - "/usr/local/include/gflags/gflags_declare.h" - "/usr/local/include/glog/log_severity.h" - "/usr/local/include/glog/logging.h" - "/usr/local/include/glog/vlog_is_on.h" -) - diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake deleted file mode 100644 index d7dfae88..00000000 
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. - -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. 
- -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend deleted file mode 100644 index f9de6105..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend +++ /dev/null @@ -1,468 +0,0 @@ -# Generated by: make2cmake.cmake -SET(CUDA_NVCC_DEPEND - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu" - "/opt/clBLAS-private-april8/include/clBLAS-complex.h" - "/opt/clBLAS-private-april8/include/clBLAS.h" - "/usr/include/H5ACpublic.h" - "/usr/include/H5Apublic.h" - "/usr/include/H5Cpublic.h" - "/usr/include/H5Dpublic.h" - "/usr/include/H5Epubgen.h" - "/usr/include/H5Epublic.h" - "/usr/include/H5FDcore.h" - "/usr/include/H5FDdirect.h" - "/usr/include/H5FDfamily.h" - "/usr/include/H5FDlog.h" - "/usr/include/H5FDmpi.h" - "/usr/include/H5FDmpio.h" - "/usr/include/H5FDmpiposix.h" - "/usr/include/H5FDmulti.h" - "/usr/include/H5FDpublic.h" - "/usr/include/H5FDsec2.h" - "/usr/include/H5FDstdio.h" - "/usr/include/H5Fpublic.h" - "/usr/include/H5Gpublic.h" - "/usr/include/H5Ipublic.h" - "/usr/include/H5Lpublic.h" - "/usr/include/H5MMpublic.h" - "/usr/include/H5Opublic.h" - "/usr/include/H5Ppublic.h" - "/usr/include/H5Rpublic.h" - "/usr/include/H5Spublic.h" - "/usr/include/H5Tpublic.h" - "/usr/include/H5Zpublic.h" - "/usr/include/H5api_adpt.h" - "/usr/include/H5pubconf.h" - "/usr/include/H5public.h" - "/usr/include/H5version.h" - "/usr/include/_G_config.h" - "/usr/include/alloca.h" - "/usr/include/asm-generic/errno-base.h" - "/usr/include/asm-generic/errno.h" - 
"/usr/include/assert.h" - "/usr/include/atlas/cblas.h" - "/usr/include/c++/4.8/algorithm" - "/usr/include/c++/4.8/backward/auto_ptr.h" - "/usr/include/c++/4.8/backward/binders.h" - "/usr/include/c++/4.8/bits/algorithmfwd.h" - "/usr/include/c++/4.8/bits/allocator.h" - "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" - "/usr/include/c++/4.8/bits/basic_ios.h" - "/usr/include/c++/4.8/bits/basic_ios.tcc" - "/usr/include/c++/4.8/bits/basic_string.h" - "/usr/include/c++/4.8/bits/basic_string.tcc" - "/usr/include/c++/4.8/bits/char_traits.h" - "/usr/include/c++/4.8/bits/codecvt.h" - "/usr/include/c++/4.8/bits/concept_check.h" - "/usr/include/c++/4.8/bits/cpp_type_traits.h" - "/usr/include/c++/4.8/bits/cxxabi_forced.h" - "/usr/include/c++/4.8/bits/exception_defines.h" - "/usr/include/c++/4.8/bits/fstream.tcc" - "/usr/include/c++/4.8/bits/functexcept.h" - "/usr/include/c++/4.8/bits/ios_base.h" - "/usr/include/c++/4.8/bits/istream.tcc" - "/usr/include/c++/4.8/bits/locale_classes.h" - "/usr/include/c++/4.8/bits/locale_classes.tcc" - "/usr/include/c++/4.8/bits/locale_facets.h" - "/usr/include/c++/4.8/bits/locale_facets.tcc" - "/usr/include/c++/4.8/bits/localefwd.h" - "/usr/include/c++/4.8/bits/memoryfwd.h" - "/usr/include/c++/4.8/bits/move.h" - "/usr/include/c++/4.8/bits/ostream.tcc" - "/usr/include/c++/4.8/bits/ostream_insert.h" - "/usr/include/c++/4.8/bits/postypes.h" - "/usr/include/c++/4.8/bits/range_access.h" - "/usr/include/c++/4.8/bits/sstream.tcc" - "/usr/include/c++/4.8/bits/stl_algo.h" - "/usr/include/c++/4.8/bits/stl_algobase.h" - "/usr/include/c++/4.8/bits/stl_bvector.h" - "/usr/include/c++/4.8/bits/stl_construct.h" - "/usr/include/c++/4.8/bits/stl_function.h" - "/usr/include/c++/4.8/bits/stl_heap.h" - "/usr/include/c++/4.8/bits/stl_iterator.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" - "/usr/include/c++/4.8/bits/stl_map.h" - "/usr/include/c++/4.8/bits/stl_multimap.h" - 
"/usr/include/c++/4.8/bits/stl_multiset.h" - "/usr/include/c++/4.8/bits/stl_pair.h" - "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" - "/usr/include/c++/4.8/bits/stl_relops.h" - "/usr/include/c++/4.8/bits/stl_set.h" - "/usr/include/c++/4.8/bits/stl_tempbuf.h" - "/usr/include/c++/4.8/bits/stl_tree.h" - "/usr/include/c++/4.8/bits/stl_uninitialized.h" - "/usr/include/c++/4.8/bits/stl_vector.h" - "/usr/include/c++/4.8/bits/stream_iterator.h" - "/usr/include/c++/4.8/bits/streambuf.tcc" - "/usr/include/c++/4.8/bits/streambuf_iterator.h" - "/usr/include/c++/4.8/bits/stringfwd.h" - "/usr/include/c++/4.8/bits/vector.tcc" - "/usr/include/c++/4.8/cctype" - "/usr/include/c++/4.8/climits" - "/usr/include/c++/4.8/clocale" - "/usr/include/c++/4.8/cmath" - "/usr/include/c++/4.8/cstddef" - "/usr/include/c++/4.8/cstdio" - "/usr/include/c++/4.8/cstdlib" - "/usr/include/c++/4.8/cwchar" - "/usr/include/c++/4.8/cwctype" - "/usr/include/c++/4.8/cxxabi.h" - "/usr/include/c++/4.8/debug/debug.h" - "/usr/include/c++/4.8/exception" - "/usr/include/c++/4.8/ext/alloc_traits.h" - "/usr/include/c++/4.8/ext/atomicity.h" - "/usr/include/c++/4.8/ext/new_allocator.h" - "/usr/include/c++/4.8/ext/numeric_traits.h" - "/usr/include/c++/4.8/ext/type_traits.h" - "/usr/include/c++/4.8/fstream" - "/usr/include/c++/4.8/functional" - "/usr/include/c++/4.8/ios" - "/usr/include/c++/4.8/iosfwd" - "/usr/include/c++/4.8/iostream" - "/usr/include/c++/4.8/istream" - "/usr/include/c++/4.8/iterator" - "/usr/include/c++/4.8/map" - "/usr/include/c++/4.8/memory" - "/usr/include/c++/4.8/new" - "/usr/include/c++/4.8/ostream" - "/usr/include/c++/4.8/set" - "/usr/include/c++/4.8/sstream" - "/usr/include/c++/4.8/streambuf" - "/usr/include/c++/4.8/string" - "/usr/include/c++/4.8/typeinfo" - "/usr/include/c++/4.8/utility" - "/usr/include/c++/4.8/vector" - "/usr/include/ctype.h" - "/usr/include/endian.h" - "/usr/include/errno.h" - "/usr/include/features.h" - "/usr/include/getopt.h" - 
"/usr/include/google/protobuf/descriptor.h" - "/usr/include/google/protobuf/extension_set.h" - "/usr/include/google/protobuf/generated_enum_reflection.h" - "/usr/include/google/protobuf/generated_message_util.h" - "/usr/include/google/protobuf/message.h" - "/usr/include/google/protobuf/message_lite.h" - "/usr/include/google/protobuf/repeated_field.h" - "/usr/include/google/protobuf/stubs/common.h" - "/usr/include/google/protobuf/stubs/template_util.h" - "/usr/include/google/protobuf/stubs/type_traits.h" - "/usr/include/google/protobuf/unknown_field_set.h" - "/usr/include/hdf5.h" - "/usr/include/inttypes.h" - "/usr/include/libio.h" - "/usr/include/limits.h" - "/usr/include/linux/errno.h" - "/usr/include/linux/limits.h" - "/usr/include/locale.h" - "/usr/include/math.h" - "/usr/include/pthread.h" - "/usr/include/sched.h" - "/usr/include/stdc-predef.h" - "/usr/include/stdint.h" - "/usr/include/stdio.h" - "/usr/include/stdlib.h" - "/usr/include/string.h" - "/usr/include/time.h" - "/usr/include/unistd.h" - "/usr/include/wchar.h" - "/usr/include/wctype.h" - "/usr/include/x86_64-linux-gnu/asm/errno.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap.h" - "/usr/include/x86_64-linux-gnu/bits/confname.h" - "/usr/include/x86_64-linux-gnu/bits/endian.h" - "/usr/include/x86_64-linux-gnu/bits/environments.h" - "/usr/include/x86_64-linux-gnu/bits/errno.h" - "/usr/include/x86_64-linux-gnu/bits/huge_val.h" - "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" - "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" - "/usr/include/x86_64-linux-gnu/bits/inf.h" - "/usr/include/x86_64-linux-gnu/bits/local_lim.h" - "/usr/include/x86_64-linux-gnu/bits/locale.h" - "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" - "/usr/include/x86_64-linux-gnu/bits/mathdef.h" - "/usr/include/x86_64-linux-gnu/bits/mathinline.h" - "/usr/include/x86_64-linux-gnu/bits/nan.h" - "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" - 
"/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" - "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" - "/usr/include/x86_64-linux-gnu/bits/sched.h" - "/usr/include/x86_64-linux-gnu/bits/select.h" - "/usr/include/x86_64-linux-gnu/bits/select2.h" - "/usr/include/x86_64-linux-gnu/bits/setjmp.h" - "/usr/include/x86_64-linux-gnu/bits/sigset.h" - "/usr/include/x86_64-linux-gnu/bits/stdio.h" - "/usr/include/x86_64-linux-gnu/bits/stdio2.h" - "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib.h" - "/usr/include/x86_64-linux-gnu/bits/string3.h" - "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" - "/usr/include/x86_64-linux-gnu/bits/time.h" - "/usr/include/x86_64-linux-gnu/bits/timex.h" - "/usr/include/x86_64-linux-gnu/bits/types.h" - "/usr/include/x86_64-linux-gnu/bits/typesizes.h" - "/usr/include/x86_64-linux-gnu/bits/unistd.h" - "/usr/include/x86_64-linux-gnu/bits/waitflags.h" - "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" - "/usr/include/x86_64-linux-gnu/bits/wchar.h" - "/usr/include/x86_64-linux-gnu/bits/wchar2.h" - "/usr/include/x86_64-linux-gnu/bits/wordsize.h" - "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" - 
"/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs.h" - "/usr/include/x86_64-linux-gnu/sys/cdefs.h" - "/usr/include/x86_64-linux-gnu/sys/select.h" - "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" - "/usr/include/x86_64-linux-gnu/sys/types.h" - "/usr/include/xlocale.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" - "/usr/local/cuda-6.5/include/CL/cl.h" - "/usr/local/cuda-6.5/include/CL/cl_ext.h" - "/usr/local/cuda-6.5/include/CL/cl_platform.h" - "/usr/local/cuda-6.5/include/builtin_types.h" - "/usr/local/cuda-6.5/include/channel_descriptor.h" - "/usr/local/cuda-6.5/include/common_functions.h" - "/usr/local/cuda-6.5/include/cuComplex.h" - "/usr/local/cuda-6.5/include/cublas_api.h" - "/usr/local/cuda-6.5/include/cublas_v2.h" - "/usr/local/cuda-6.5/include/cuda.h" - "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_runtime.h" - "/usr/local/cuda-6.5/include/cuda_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_surface_types.h" - "/usr/local/cuda-6.5/include/cuda_texture_types.h" - "/usr/local/cuda-6.5/include/curand.h" - "/usr/local/cuda-6.5/include/device_functions.h" - "/usr/local/cuda-6.5/include/device_launch_parameters.h" - "/usr/local/cuda-6.5/include/device_types.h" - "/usr/local/cuda-6.5/include/driver_functions.h" - 
"/usr/local/cuda-6.5/include/driver_types.h" - "/usr/local/cuda-6.5/include/host_config.h" - "/usr/local/cuda-6.5/include/host_defines.h" - "/usr/local/cuda-6.5/include/math_functions.h" - "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" - "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_13_double_functions.h" - "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" - "/usr/local/cuda-6.5/include/surface_functions.h" - "/usr/local/cuda-6.5/include/surface_indirect_functions.h" - "/usr/local/cuda-6.5/include/surface_types.h" - "/usr/local/cuda-6.5/include/texture_fetch_functions.h" - "/usr/local/cuda-6.5/include/texture_indirect_functions.h" - "/usr/local/cuda-6.5/include/texture_types.h" - "/usr/local/cuda-6.5/include/vector_functions.h" - "/usr/local/cuda-6.5/include/vector_types.h" - "/usr/local/include/boost/assert.hpp" - "/usr/local/include/boost/checked_delete.hpp" - "/usr/local/include/boost/config.hpp" - "/usr/local/include/boost/config/compiler/gcc.hpp" - "/usr/local/include/boost/config/compiler/nvcc.hpp" - "/usr/local/include/boost/config/no_tr1/memory.hpp" - "/usr/local/include/boost/config/no_tr1/utility.hpp" - "/usr/local/include/boost/config/platform/linux.hpp" - "/usr/local/include/boost/config/posix_features.hpp" - "/usr/local/include/boost/config/select_compiler_config.hpp" - "/usr/local/include/boost/config/select_platform_config.hpp" - "/usr/local/include/boost/config/select_stdlib_config.hpp" - "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" - "/usr/local/include/boost/config/suffix.hpp" - 
"/usr/local/include/boost/config/user.hpp" - "/usr/local/include/boost/core/checked_delete.hpp" - "/usr/local/include/boost/core/demangle.hpp" - "/usr/local/include/boost/core/typeinfo.hpp" - "/usr/local/include/boost/current_function.hpp" - "/usr/local/include/boost/detail/sp_typeinfo.hpp" - "/usr/local/include/boost/detail/workaround.hpp" - "/usr/local/include/boost/exception/exception.hpp" - "/usr/local/include/boost/predef.h" - "/usr/local/include/boost/predef/architecture.h" - "/usr/local/include/boost/predef/architecture/alpha.h" - "/usr/local/include/boost/predef/architecture/arm.h" - "/usr/local/include/boost/predef/architecture/blackfin.h" - "/usr/local/include/boost/predef/architecture/convex.h" - "/usr/local/include/boost/predef/architecture/ia64.h" - "/usr/local/include/boost/predef/architecture/m68k.h" - "/usr/local/include/boost/predef/architecture/mips.h" - "/usr/local/include/boost/predef/architecture/parisc.h" - "/usr/local/include/boost/predef/architecture/ppc.h" - "/usr/local/include/boost/predef/architecture/pyramid.h" - "/usr/local/include/boost/predef/architecture/rs6k.h" - "/usr/local/include/boost/predef/architecture/sparc.h" - "/usr/local/include/boost/predef/architecture/superh.h" - "/usr/local/include/boost/predef/architecture/sys370.h" - "/usr/local/include/boost/predef/architecture/sys390.h" - "/usr/local/include/boost/predef/architecture/x86.h" - "/usr/local/include/boost/predef/architecture/x86/32.h" - "/usr/local/include/boost/predef/architecture/x86/64.h" - "/usr/local/include/boost/predef/architecture/z.h" - "/usr/local/include/boost/predef/compiler.h" - "/usr/local/include/boost/predef/compiler/borland.h" - "/usr/local/include/boost/predef/compiler/clang.h" - "/usr/local/include/boost/predef/compiler/comeau.h" - "/usr/local/include/boost/predef/compiler/compaq.h" - "/usr/local/include/boost/predef/compiler/diab.h" - "/usr/local/include/boost/predef/compiler/digitalmars.h" - "/usr/local/include/boost/predef/compiler/dignus.h" - 
"/usr/local/include/boost/predef/compiler/edg.h" - "/usr/local/include/boost/predef/compiler/ekopath.h" - "/usr/local/include/boost/predef/compiler/gcc.h" - "/usr/local/include/boost/predef/compiler/gcc_xml.h" - "/usr/local/include/boost/predef/compiler/greenhills.h" - "/usr/local/include/boost/predef/compiler/hp_acc.h" - "/usr/local/include/boost/predef/compiler/iar.h" - "/usr/local/include/boost/predef/compiler/ibm.h" - "/usr/local/include/boost/predef/compiler/intel.h" - "/usr/local/include/boost/predef/compiler/kai.h" - "/usr/local/include/boost/predef/compiler/llvm.h" - "/usr/local/include/boost/predef/compiler/metaware.h" - "/usr/local/include/boost/predef/compiler/metrowerks.h" - "/usr/local/include/boost/predef/compiler/microtec.h" - "/usr/local/include/boost/predef/compiler/mpw.h" - "/usr/local/include/boost/predef/compiler/palm.h" - "/usr/local/include/boost/predef/compiler/pgi.h" - "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" - "/usr/local/include/boost/predef/compiler/sunpro.h" - "/usr/local/include/boost/predef/compiler/tendra.h" - "/usr/local/include/boost/predef/compiler/visualc.h" - "/usr/local/include/boost/predef/compiler/watcom.h" - "/usr/local/include/boost/predef/detail/_cassert.h" - "/usr/local/include/boost/predef/detail/_exception.h" - "/usr/local/include/boost/predef/detail/comp_detected.h" - "/usr/local/include/boost/predef/detail/os_detected.h" - "/usr/local/include/boost/predef/detail/test.h" - "/usr/local/include/boost/predef/language.h" - "/usr/local/include/boost/predef/language/objc.h" - "/usr/local/include/boost/predef/language/stdc.h" - "/usr/local/include/boost/predef/language/stdcpp.h" - "/usr/local/include/boost/predef/library.h" - "/usr/local/include/boost/predef/library/c.h" - "/usr/local/include/boost/predef/library/c/_prefix.h" - "/usr/local/include/boost/predef/library/c/gnu.h" - "/usr/local/include/boost/predef/library/c/uc.h" - "/usr/local/include/boost/predef/library/c/vms.h" - 
"/usr/local/include/boost/predef/library/c/zos.h" - "/usr/local/include/boost/predef/library/std.h" - "/usr/local/include/boost/predef/library/std/_prefix.h" - "/usr/local/include/boost/predef/library/std/cxx.h" - "/usr/local/include/boost/predef/library/std/dinkumware.h" - "/usr/local/include/boost/predef/library/std/libcomo.h" - "/usr/local/include/boost/predef/library/std/modena.h" - "/usr/local/include/boost/predef/library/std/msl.h" - "/usr/local/include/boost/predef/library/std/roguewave.h" - "/usr/local/include/boost/predef/library/std/sgi.h" - "/usr/local/include/boost/predef/library/std/stdcpp3.h" - "/usr/local/include/boost/predef/library/std/stlport.h" - "/usr/local/include/boost/predef/library/std/vacpp.h" - "/usr/local/include/boost/predef/make.h" - "/usr/local/include/boost/predef/os.h" - "/usr/local/include/boost/predef/os/aix.h" - "/usr/local/include/boost/predef/os/amigaos.h" - "/usr/local/include/boost/predef/os/android.h" - "/usr/local/include/boost/predef/os/beos.h" - "/usr/local/include/boost/predef/os/bsd.h" - "/usr/local/include/boost/predef/os/bsd/bsdi.h" - "/usr/local/include/boost/predef/os/bsd/dragonfly.h" - "/usr/local/include/boost/predef/os/bsd/free.h" - "/usr/local/include/boost/predef/os/bsd/net.h" - "/usr/local/include/boost/predef/os/bsd/open.h" - "/usr/local/include/boost/predef/os/cygwin.h" - "/usr/local/include/boost/predef/os/hpux.h" - "/usr/local/include/boost/predef/os/ios.h" - "/usr/local/include/boost/predef/os/irix.h" - "/usr/local/include/boost/predef/os/linux.h" - "/usr/local/include/boost/predef/os/macos.h" - "/usr/local/include/boost/predef/os/os400.h" - "/usr/local/include/boost/predef/os/qnxnto.h" - "/usr/local/include/boost/predef/os/solaris.h" - "/usr/local/include/boost/predef/os/unix.h" - "/usr/local/include/boost/predef/os/vms.h" - "/usr/local/include/boost/predef/os/windows.h" - "/usr/local/include/boost/predef/other.h" - "/usr/local/include/boost/predef/other/endian.h" - 
"/usr/local/include/boost/predef/platform.h" - "/usr/local/include/boost/predef/platform/mingw.h" - "/usr/local/include/boost/predef/platform/windows_desktop.h" - "/usr/local/include/boost/predef/platform/windows_phone.h" - "/usr/local/include/boost/predef/platform/windows_runtime.h" - "/usr/local/include/boost/predef/platform/windows_store.h" - "/usr/local/include/boost/predef/version_number.h" - "/usr/local/include/boost/scoped_ptr.hpp" - "/usr/local/include/boost/shared_ptr.hpp" - "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" - "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" - "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" - "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp" - "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" - "/usr/local/include/boost/throw_exception.hpp" - "/usr/local/include/gflags/gflags.h" - "/usr/local/include/gflags/gflags_declare.h" - "/usr/local/include/glog/log_severity.h" - "/usr/local/include/glog/logging.h" - "/usr/local/include/glog/vlog_is_on.h" -) - diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake deleted file mode 100644 index dd2453ae..00000000 --- 
a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. - -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. 
- -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/silence_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake deleted file mode 100644 index 990e0622..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/slice_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake deleted file mode 100644 index ebf29ea2..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake deleted file mode 100644 index 6260b6e0..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_loss_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake deleted file mode 100644 index ad49afe7..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/split_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake deleted file mode 100644 index 71fc8fdb..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/tanh_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake deleted file mode 100644 index 4e18059a..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/threshold_layer.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake deleted file mode 100644 index 8de5e27c..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. 
- -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. 
This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. - -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend deleted file mode 100644 index 36db02fe..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend +++ /dev/null @@ -1,404 +0,0 @@ -# Generated by: make2cmake.cmake -SET(CUDA_NVCC_DEPEND - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu" - "/opt/clBLAS-private-april8/include/clBLAS-complex.h" - "/opt/clBLAS-private-april8/include/clBLAS.h" - "/usr/include/_G_config.h" - "/usr/include/alloca.h" - "/usr/include/asm-generic/errno-base.h" - "/usr/include/asm-generic/errno.h" - "/usr/include/assert.h" - "/usr/include/c++/4.8/algorithm" - "/usr/include/c++/4.8/backward/auto_ptr.h" - "/usr/include/c++/4.8/backward/binders.h" - "/usr/include/c++/4.8/bits/algorithmfwd.h" - "/usr/include/c++/4.8/bits/allocator.h" - "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" - 
"/usr/include/c++/4.8/bits/basic_ios.h" - "/usr/include/c++/4.8/bits/basic_ios.tcc" - "/usr/include/c++/4.8/bits/basic_string.h" - "/usr/include/c++/4.8/bits/basic_string.tcc" - "/usr/include/c++/4.8/bits/char_traits.h" - "/usr/include/c++/4.8/bits/codecvt.h" - "/usr/include/c++/4.8/bits/concept_check.h" - "/usr/include/c++/4.8/bits/cpp_type_traits.h" - "/usr/include/c++/4.8/bits/cxxabi_forced.h" - "/usr/include/c++/4.8/bits/exception_defines.h" - "/usr/include/c++/4.8/bits/fstream.tcc" - "/usr/include/c++/4.8/bits/functexcept.h" - "/usr/include/c++/4.8/bits/ios_base.h" - "/usr/include/c++/4.8/bits/istream.tcc" - "/usr/include/c++/4.8/bits/locale_classes.h" - "/usr/include/c++/4.8/bits/locale_classes.tcc" - "/usr/include/c++/4.8/bits/locale_facets.h" - "/usr/include/c++/4.8/bits/locale_facets.tcc" - "/usr/include/c++/4.8/bits/localefwd.h" - "/usr/include/c++/4.8/bits/memoryfwd.h" - "/usr/include/c++/4.8/bits/move.h" - "/usr/include/c++/4.8/bits/ostream.tcc" - "/usr/include/c++/4.8/bits/ostream_insert.h" - "/usr/include/c++/4.8/bits/postypes.h" - "/usr/include/c++/4.8/bits/range_access.h" - "/usr/include/c++/4.8/bits/sstream.tcc" - "/usr/include/c++/4.8/bits/stl_algo.h" - "/usr/include/c++/4.8/bits/stl_algobase.h" - "/usr/include/c++/4.8/bits/stl_bvector.h" - "/usr/include/c++/4.8/bits/stl_construct.h" - "/usr/include/c++/4.8/bits/stl_function.h" - "/usr/include/c++/4.8/bits/stl_heap.h" - "/usr/include/c++/4.8/bits/stl_iterator.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" - "/usr/include/c++/4.8/bits/stl_map.h" - "/usr/include/c++/4.8/bits/stl_multimap.h" - "/usr/include/c++/4.8/bits/stl_multiset.h" - "/usr/include/c++/4.8/bits/stl_pair.h" - "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" - "/usr/include/c++/4.8/bits/stl_relops.h" - "/usr/include/c++/4.8/bits/stl_set.h" - "/usr/include/c++/4.8/bits/stl_tempbuf.h" - "/usr/include/c++/4.8/bits/stl_tree.h" - 
"/usr/include/c++/4.8/bits/stl_uninitialized.h" - "/usr/include/c++/4.8/bits/stl_vector.h" - "/usr/include/c++/4.8/bits/streambuf.tcc" - "/usr/include/c++/4.8/bits/streambuf_iterator.h" - "/usr/include/c++/4.8/bits/stringfwd.h" - "/usr/include/c++/4.8/bits/vector.tcc" - "/usr/include/c++/4.8/cctype" - "/usr/include/c++/4.8/climits" - "/usr/include/c++/4.8/clocale" - "/usr/include/c++/4.8/cmath" - "/usr/include/c++/4.8/cstddef" - "/usr/include/c++/4.8/cstdio" - "/usr/include/c++/4.8/cstdlib" - "/usr/include/c++/4.8/cstring" - "/usr/include/c++/4.8/cwchar" - "/usr/include/c++/4.8/cwctype" - "/usr/include/c++/4.8/cxxabi.h" - "/usr/include/c++/4.8/debug/debug.h" - "/usr/include/c++/4.8/exception" - "/usr/include/c++/4.8/ext/alloc_traits.h" - "/usr/include/c++/4.8/ext/atomicity.h" - "/usr/include/c++/4.8/ext/new_allocator.h" - "/usr/include/c++/4.8/ext/numeric_traits.h" - "/usr/include/c++/4.8/ext/type_traits.h" - "/usr/include/c++/4.8/fstream" - "/usr/include/c++/4.8/functional" - "/usr/include/c++/4.8/ios" - "/usr/include/c++/4.8/iosfwd" - "/usr/include/c++/4.8/iostream" - "/usr/include/c++/4.8/istream" - "/usr/include/c++/4.8/map" - "/usr/include/c++/4.8/memory" - "/usr/include/c++/4.8/new" - "/usr/include/c++/4.8/ostream" - "/usr/include/c++/4.8/set" - "/usr/include/c++/4.8/sstream" - "/usr/include/c++/4.8/streambuf" - "/usr/include/c++/4.8/string" - "/usr/include/c++/4.8/typeinfo" - "/usr/include/c++/4.8/utility" - "/usr/include/c++/4.8/vector" - "/usr/include/ctype.h" - "/usr/include/endian.h" - "/usr/include/errno.h" - "/usr/include/features.h" - "/usr/include/getopt.h" - "/usr/include/inttypes.h" - "/usr/include/libio.h" - "/usr/include/limits.h" - "/usr/include/linux/errno.h" - "/usr/include/linux/limits.h" - "/usr/include/locale.h" - "/usr/include/math.h" - "/usr/include/pthread.h" - "/usr/include/sched.h" - "/usr/include/stdc-predef.h" - "/usr/include/stdint.h" - "/usr/include/stdio.h" - "/usr/include/stdlib.h" - "/usr/include/string.h" - 
"/usr/include/time.h" - "/usr/include/unistd.h" - "/usr/include/wchar.h" - "/usr/include/wctype.h" - "/usr/include/x86_64-linux-gnu/asm/errno.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap.h" - "/usr/include/x86_64-linux-gnu/bits/confname.h" - "/usr/include/x86_64-linux-gnu/bits/endian.h" - "/usr/include/x86_64-linux-gnu/bits/environments.h" - "/usr/include/x86_64-linux-gnu/bits/errno.h" - "/usr/include/x86_64-linux-gnu/bits/huge_val.h" - "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" - "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" - "/usr/include/x86_64-linux-gnu/bits/inf.h" - "/usr/include/x86_64-linux-gnu/bits/local_lim.h" - "/usr/include/x86_64-linux-gnu/bits/locale.h" - "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" - "/usr/include/x86_64-linux-gnu/bits/mathdef.h" - "/usr/include/x86_64-linux-gnu/bits/mathinline.h" - "/usr/include/x86_64-linux-gnu/bits/nan.h" - "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" - "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" - "/usr/include/x86_64-linux-gnu/bits/sched.h" - "/usr/include/x86_64-linux-gnu/bits/select.h" - "/usr/include/x86_64-linux-gnu/bits/select2.h" - "/usr/include/x86_64-linux-gnu/bits/setjmp.h" - "/usr/include/x86_64-linux-gnu/bits/sigset.h" - "/usr/include/x86_64-linux-gnu/bits/stdio.h" - "/usr/include/x86_64-linux-gnu/bits/stdio2.h" - "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib.h" - "/usr/include/x86_64-linux-gnu/bits/string3.h" - "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" - "/usr/include/x86_64-linux-gnu/bits/time.h" - "/usr/include/x86_64-linux-gnu/bits/timex.h" - "/usr/include/x86_64-linux-gnu/bits/types.h" - "/usr/include/x86_64-linux-gnu/bits/typesizes.h" - 
"/usr/include/x86_64-linux-gnu/bits/unistd.h" - "/usr/include/x86_64-linux-gnu/bits/waitflags.h" - "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" - "/usr/include/x86_64-linux-gnu/bits/wchar.h" - "/usr/include/x86_64-linux-gnu/bits/wchar2.h" - "/usr/include/x86_64-linux-gnu/bits/wordsize.h" - "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs.h" - "/usr/include/x86_64-linux-gnu/sys/cdefs.h" - "/usr/include/x86_64-linux-gnu/sys/select.h" - "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" - "/usr/include/x86_64-linux-gnu/sys/types.h" - "/usr/include/xlocale.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" - "/usr/local/cuda-6.5/include/CL/cl.h" - 
"/usr/local/cuda-6.5/include/CL/cl_ext.h" - "/usr/local/cuda-6.5/include/CL/cl_platform.h" - "/usr/local/cuda-6.5/include/builtin_types.h" - "/usr/local/cuda-6.5/include/channel_descriptor.h" - "/usr/local/cuda-6.5/include/common_functions.h" - "/usr/local/cuda-6.5/include/cuComplex.h" - "/usr/local/cuda-6.5/include/cublas_api.h" - "/usr/local/cuda-6.5/include/cublas_v2.h" - "/usr/local/cuda-6.5/include/cuda.h" - "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_runtime.h" - "/usr/local/cuda-6.5/include/cuda_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_surface_types.h" - "/usr/local/cuda-6.5/include/cuda_texture_types.h" - "/usr/local/cuda-6.5/include/curand.h" - "/usr/local/cuda-6.5/include/device_functions.h" - "/usr/local/cuda-6.5/include/device_launch_parameters.h" - "/usr/local/cuda-6.5/include/device_types.h" - "/usr/local/cuda-6.5/include/driver_functions.h" - "/usr/local/cuda-6.5/include/driver_types.h" - "/usr/local/cuda-6.5/include/host_config.h" - "/usr/local/cuda-6.5/include/host_defines.h" - "/usr/local/cuda-6.5/include/math_functions.h" - "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" - "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_13_double_functions.h" - "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" - "/usr/local/cuda-6.5/include/surface_functions.h" - "/usr/local/cuda-6.5/include/surface_indirect_functions.h" - "/usr/local/cuda-6.5/include/surface_types.h" - "/usr/local/cuda-6.5/include/texture_fetch_functions.h" - "/usr/local/cuda-6.5/include/texture_indirect_functions.h" 
- "/usr/local/cuda-6.5/include/texture_types.h" - "/usr/local/cuda-6.5/include/vector_functions.h" - "/usr/local/cuda-6.5/include/vector_types.h" - "/usr/local/include/boost/assert.hpp" - "/usr/local/include/boost/checked_delete.hpp" - "/usr/local/include/boost/config.hpp" - "/usr/local/include/boost/config/compiler/gcc.hpp" - "/usr/local/include/boost/config/compiler/nvcc.hpp" - "/usr/local/include/boost/config/no_tr1/memory.hpp" - "/usr/local/include/boost/config/no_tr1/utility.hpp" - "/usr/local/include/boost/config/platform/linux.hpp" - "/usr/local/include/boost/config/posix_features.hpp" - "/usr/local/include/boost/config/select_compiler_config.hpp" - "/usr/local/include/boost/config/select_platform_config.hpp" - "/usr/local/include/boost/config/select_stdlib_config.hpp" - "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" - "/usr/local/include/boost/config/suffix.hpp" - "/usr/local/include/boost/config/user.hpp" - "/usr/local/include/boost/core/checked_delete.hpp" - "/usr/local/include/boost/core/demangle.hpp" - "/usr/local/include/boost/core/typeinfo.hpp" - "/usr/local/include/boost/current_function.hpp" - "/usr/local/include/boost/detail/sp_typeinfo.hpp" - "/usr/local/include/boost/detail/workaround.hpp" - "/usr/local/include/boost/exception/exception.hpp" - "/usr/local/include/boost/predef.h" - "/usr/local/include/boost/predef/architecture.h" - "/usr/local/include/boost/predef/architecture/alpha.h" - "/usr/local/include/boost/predef/architecture/arm.h" - "/usr/local/include/boost/predef/architecture/blackfin.h" - "/usr/local/include/boost/predef/architecture/convex.h" - "/usr/local/include/boost/predef/architecture/ia64.h" - "/usr/local/include/boost/predef/architecture/m68k.h" - "/usr/local/include/boost/predef/architecture/mips.h" - "/usr/local/include/boost/predef/architecture/parisc.h" - "/usr/local/include/boost/predef/architecture/ppc.h" - "/usr/local/include/boost/predef/architecture/pyramid.h" - 
"/usr/local/include/boost/predef/architecture/rs6k.h" - "/usr/local/include/boost/predef/architecture/sparc.h" - "/usr/local/include/boost/predef/architecture/superh.h" - "/usr/local/include/boost/predef/architecture/sys370.h" - "/usr/local/include/boost/predef/architecture/sys390.h" - "/usr/local/include/boost/predef/architecture/x86.h" - "/usr/local/include/boost/predef/architecture/x86/32.h" - "/usr/local/include/boost/predef/architecture/x86/64.h" - "/usr/local/include/boost/predef/architecture/z.h" - "/usr/local/include/boost/predef/compiler.h" - "/usr/local/include/boost/predef/compiler/borland.h" - "/usr/local/include/boost/predef/compiler/clang.h" - "/usr/local/include/boost/predef/compiler/comeau.h" - "/usr/local/include/boost/predef/compiler/compaq.h" - "/usr/local/include/boost/predef/compiler/diab.h" - "/usr/local/include/boost/predef/compiler/digitalmars.h" - "/usr/local/include/boost/predef/compiler/dignus.h" - "/usr/local/include/boost/predef/compiler/edg.h" - "/usr/local/include/boost/predef/compiler/ekopath.h" - "/usr/local/include/boost/predef/compiler/gcc.h" - "/usr/local/include/boost/predef/compiler/gcc_xml.h" - "/usr/local/include/boost/predef/compiler/greenhills.h" - "/usr/local/include/boost/predef/compiler/hp_acc.h" - "/usr/local/include/boost/predef/compiler/iar.h" - "/usr/local/include/boost/predef/compiler/ibm.h" - "/usr/local/include/boost/predef/compiler/intel.h" - "/usr/local/include/boost/predef/compiler/kai.h" - "/usr/local/include/boost/predef/compiler/llvm.h" - "/usr/local/include/boost/predef/compiler/metaware.h" - "/usr/local/include/boost/predef/compiler/metrowerks.h" - "/usr/local/include/boost/predef/compiler/microtec.h" - "/usr/local/include/boost/predef/compiler/mpw.h" - "/usr/local/include/boost/predef/compiler/palm.h" - "/usr/local/include/boost/predef/compiler/pgi.h" - "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" - "/usr/local/include/boost/predef/compiler/sunpro.h" - 
"/usr/local/include/boost/predef/compiler/tendra.h" - "/usr/local/include/boost/predef/compiler/visualc.h" - "/usr/local/include/boost/predef/compiler/watcom.h" - "/usr/local/include/boost/predef/detail/_cassert.h" - "/usr/local/include/boost/predef/detail/_exception.h" - "/usr/local/include/boost/predef/detail/comp_detected.h" - "/usr/local/include/boost/predef/detail/os_detected.h" - "/usr/local/include/boost/predef/detail/test.h" - "/usr/local/include/boost/predef/language.h" - "/usr/local/include/boost/predef/language/objc.h" - "/usr/local/include/boost/predef/language/stdc.h" - "/usr/local/include/boost/predef/language/stdcpp.h" - "/usr/local/include/boost/predef/library.h" - "/usr/local/include/boost/predef/library/c.h" - "/usr/local/include/boost/predef/library/c/_prefix.h" - "/usr/local/include/boost/predef/library/c/gnu.h" - "/usr/local/include/boost/predef/library/c/uc.h" - "/usr/local/include/boost/predef/library/c/vms.h" - "/usr/local/include/boost/predef/library/c/zos.h" - "/usr/local/include/boost/predef/library/std.h" - "/usr/local/include/boost/predef/library/std/_prefix.h" - "/usr/local/include/boost/predef/library/std/cxx.h" - "/usr/local/include/boost/predef/library/std/dinkumware.h" - "/usr/local/include/boost/predef/library/std/libcomo.h" - "/usr/local/include/boost/predef/library/std/modena.h" - "/usr/local/include/boost/predef/library/std/msl.h" - "/usr/local/include/boost/predef/library/std/roguewave.h" - "/usr/local/include/boost/predef/library/std/sgi.h" - "/usr/local/include/boost/predef/library/std/stdcpp3.h" - "/usr/local/include/boost/predef/library/std/stlport.h" - "/usr/local/include/boost/predef/library/std/vacpp.h" - "/usr/local/include/boost/predef/make.h" - "/usr/local/include/boost/predef/os.h" - "/usr/local/include/boost/predef/os/aix.h" - "/usr/local/include/boost/predef/os/amigaos.h" - "/usr/local/include/boost/predef/os/android.h" - "/usr/local/include/boost/predef/os/beos.h" - "/usr/local/include/boost/predef/os/bsd.h" - 
"/usr/local/include/boost/predef/os/bsd/bsdi.h" - "/usr/local/include/boost/predef/os/bsd/dragonfly.h" - "/usr/local/include/boost/predef/os/bsd/free.h" - "/usr/local/include/boost/predef/os/bsd/net.h" - "/usr/local/include/boost/predef/os/bsd/open.h" - "/usr/local/include/boost/predef/os/cygwin.h" - "/usr/local/include/boost/predef/os/hpux.h" - "/usr/local/include/boost/predef/os/ios.h" - "/usr/local/include/boost/predef/os/irix.h" - "/usr/local/include/boost/predef/os/linux.h" - "/usr/local/include/boost/predef/os/macos.h" - "/usr/local/include/boost/predef/os/os400.h" - "/usr/local/include/boost/predef/os/qnxnto.h" - "/usr/local/include/boost/predef/os/solaris.h" - "/usr/local/include/boost/predef/os/unix.h" - "/usr/local/include/boost/predef/os/vms.h" - "/usr/local/include/boost/predef/os/windows.h" - "/usr/local/include/boost/predef/other.h" - "/usr/local/include/boost/predef/other/endian.h" - "/usr/local/include/boost/predef/platform.h" - "/usr/local/include/boost/predef/platform/mingw.h" - "/usr/local/include/boost/predef/platform/windows_desktop.h" - "/usr/local/include/boost/predef/platform/windows_phone.h" - "/usr/local/include/boost/predef/platform/windows_runtime.h" - "/usr/local/include/boost/predef/platform/windows_store.h" - "/usr/local/include/boost/predef/version_number.h" - "/usr/local/include/boost/shared_ptr.hpp" - "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" - "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" - "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" - 
"/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" - "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" - "/usr/local/include/boost/throw_exception.hpp" - "/usr/local/include/gflags/gflags.h" - "/usr/local/include/gflags/gflags_declare.h" - "/usr/local/include/glog/log_severity.h" - "/usr/local/include/glog/logging.h" - "/usr/local/include/glog/vlog_is_on.h" -) - diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake deleted file mode 100644 index 0bd0d4e9..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. - -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. 
- -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend deleted file mode 100644 index 2dfb589a..00000000 --- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend +++ /dev/null @@ -1,744 +0,0 @@ -# Generated by: make2cmake.cmake -SET(CUDA_NVCC_DEPEND - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu" - "/opt/clBLAS-private-april8/include/clBLAS-complex.h" - "/opt/clBLAS-private-april8/include/clBLAS.h" - "/usr/include/_G_config.h" - "/usr/include/alloca.h" - "/usr/include/asm-generic/errno-base.h" - "/usr/include/asm-generic/errno.h" - "/usr/include/assert.h" - "/usr/include/atlas/cblas.h" - "/usr/include/c++/4.8/algorithm" - 
"/usr/include/c++/4.8/backward/auto_ptr.h" - "/usr/include/c++/4.8/backward/binders.h" - "/usr/include/c++/4.8/bits/algorithmfwd.h" - "/usr/include/c++/4.8/bits/allocator.h" - "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h" - "/usr/include/c++/4.8/bits/basic_ios.h" - "/usr/include/c++/4.8/bits/basic_ios.tcc" - "/usr/include/c++/4.8/bits/basic_string.h" - "/usr/include/c++/4.8/bits/basic_string.tcc" - "/usr/include/c++/4.8/bits/char_traits.h" - "/usr/include/c++/4.8/bits/codecvt.h" - "/usr/include/c++/4.8/bits/concept_check.h" - "/usr/include/c++/4.8/bits/cpp_type_traits.h" - "/usr/include/c++/4.8/bits/cxxabi_forced.h" - "/usr/include/c++/4.8/bits/exception_defines.h" - "/usr/include/c++/4.8/bits/fstream.tcc" - "/usr/include/c++/4.8/bits/functexcept.h" - "/usr/include/c++/4.8/bits/ios_base.h" - "/usr/include/c++/4.8/bits/istream.tcc" - "/usr/include/c++/4.8/bits/locale_classes.h" - "/usr/include/c++/4.8/bits/locale_classes.tcc" - "/usr/include/c++/4.8/bits/locale_facets.h" - "/usr/include/c++/4.8/bits/locale_facets.tcc" - "/usr/include/c++/4.8/bits/localefwd.h" - "/usr/include/c++/4.8/bits/memoryfwd.h" - "/usr/include/c++/4.8/bits/move.h" - "/usr/include/c++/4.8/bits/ostream.tcc" - "/usr/include/c++/4.8/bits/ostream_insert.h" - "/usr/include/c++/4.8/bits/postypes.h" - "/usr/include/c++/4.8/bits/range_access.h" - "/usr/include/c++/4.8/bits/sstream.tcc" - "/usr/include/c++/4.8/bits/stl_algo.h" - "/usr/include/c++/4.8/bits/stl_algobase.h" - "/usr/include/c++/4.8/bits/stl_bvector.h" - "/usr/include/c++/4.8/bits/stl_construct.h" - "/usr/include/c++/4.8/bits/stl_function.h" - "/usr/include/c++/4.8/bits/stl_heap.h" - "/usr/include/c++/4.8/bits/stl_iterator.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h" - "/usr/include/c++/4.8/bits/stl_iterator_base_types.h" - "/usr/include/c++/4.8/bits/stl_map.h" - "/usr/include/c++/4.8/bits/stl_multimap.h" - "/usr/include/c++/4.8/bits/stl_multiset.h" - "/usr/include/c++/4.8/bits/stl_pair.h" - 
"/usr/include/c++/4.8/bits/stl_raw_storage_iter.h" - "/usr/include/c++/4.8/bits/stl_relops.h" - "/usr/include/c++/4.8/bits/stl_set.h" - "/usr/include/c++/4.8/bits/stl_tempbuf.h" - "/usr/include/c++/4.8/bits/stl_tree.h" - "/usr/include/c++/4.8/bits/stl_uninitialized.h" - "/usr/include/c++/4.8/bits/stl_vector.h" - "/usr/include/c++/4.8/bits/stream_iterator.h" - "/usr/include/c++/4.8/bits/streambuf.tcc" - "/usr/include/c++/4.8/bits/streambuf_iterator.h" - "/usr/include/c++/4.8/bits/stringfwd.h" - "/usr/include/c++/4.8/bits/vector.tcc" - "/usr/include/c++/4.8/cctype" - "/usr/include/c++/4.8/climits" - "/usr/include/c++/4.8/clocale" - "/usr/include/c++/4.8/cmath" - "/usr/include/c++/4.8/cstddef" - "/usr/include/c++/4.8/cstdio" - "/usr/include/c++/4.8/cstdlib" - "/usr/include/c++/4.8/cstring" - "/usr/include/c++/4.8/cwchar" - "/usr/include/c++/4.8/cwctype" - "/usr/include/c++/4.8/cxxabi.h" - "/usr/include/c++/4.8/debug/debug.h" - "/usr/include/c++/4.8/exception" - "/usr/include/c++/4.8/ext/alloc_traits.h" - "/usr/include/c++/4.8/ext/atomicity.h" - "/usr/include/c++/4.8/ext/new_allocator.h" - "/usr/include/c++/4.8/ext/numeric_traits.h" - "/usr/include/c++/4.8/ext/type_traits.h" - "/usr/include/c++/4.8/fstream" - "/usr/include/c++/4.8/functional" - "/usr/include/c++/4.8/ios" - "/usr/include/c++/4.8/iosfwd" - "/usr/include/c++/4.8/iostream" - "/usr/include/c++/4.8/istream" - "/usr/include/c++/4.8/iterator" - "/usr/include/c++/4.8/limits" - "/usr/include/c++/4.8/map" - "/usr/include/c++/4.8/memory" - "/usr/include/c++/4.8/new" - "/usr/include/c++/4.8/ostream" - "/usr/include/c++/4.8/set" - "/usr/include/c++/4.8/sstream" - "/usr/include/c++/4.8/stdexcept" - "/usr/include/c++/4.8/streambuf" - "/usr/include/c++/4.8/string" - "/usr/include/c++/4.8/typeinfo" - "/usr/include/c++/4.8/utility" - "/usr/include/c++/4.8/vector" - "/usr/include/ctype.h" - "/usr/include/endian.h" - "/usr/include/errno.h" - "/usr/include/features.h" - "/usr/include/getopt.h" - "/usr/include/inttypes.h" - 
"/usr/include/libio.h" - "/usr/include/limits.h" - "/usr/include/linux/errno.h" - "/usr/include/linux/limits.h" - "/usr/include/locale.h" - "/usr/include/math.h" - "/usr/include/pthread.h" - "/usr/include/sched.h" - "/usr/include/stdc-predef.h" - "/usr/include/stdint.h" - "/usr/include/stdio.h" - "/usr/include/stdlib.h" - "/usr/include/string.h" - "/usr/include/time.h" - "/usr/include/unistd.h" - "/usr/include/wchar.h" - "/usr/include/wctype.h" - "/usr/include/x86_64-linux-gnu/asm/errno.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h" - "/usr/include/x86_64-linux-gnu/bits/byteswap.h" - "/usr/include/x86_64-linux-gnu/bits/confname.h" - "/usr/include/x86_64-linux-gnu/bits/endian.h" - "/usr/include/x86_64-linux-gnu/bits/environments.h" - "/usr/include/x86_64-linux-gnu/bits/errno.h" - "/usr/include/x86_64-linux-gnu/bits/huge_val.h" - "/usr/include/x86_64-linux-gnu/bits/huge_valf.h" - "/usr/include/x86_64-linux-gnu/bits/huge_vall.h" - "/usr/include/x86_64-linux-gnu/bits/inf.h" - "/usr/include/x86_64-linux-gnu/bits/local_lim.h" - "/usr/include/x86_64-linux-gnu/bits/locale.h" - "/usr/include/x86_64-linux-gnu/bits/mathcalls.h" - "/usr/include/x86_64-linux-gnu/bits/mathdef.h" - "/usr/include/x86_64-linux-gnu/bits/mathinline.h" - "/usr/include/x86_64-linux-gnu/bits/nan.h" - "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h" - "/usr/include/x86_64-linux-gnu/bits/posix_opt.h" - "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h" - "/usr/include/x86_64-linux-gnu/bits/sched.h" - "/usr/include/x86_64-linux-gnu/bits/select.h" - "/usr/include/x86_64-linux-gnu/bits/select2.h" - "/usr/include/x86_64-linux-gnu/bits/setjmp.h" - "/usr/include/x86_64-linux-gnu/bits/sigset.h" - "/usr/include/x86_64-linux-gnu/bits/stdio.h" - "/usr/include/x86_64-linux-gnu/bits/stdio2.h" - "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h" - "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h" - 
"/usr/include/x86_64-linux-gnu/bits/stdlib.h" - "/usr/include/x86_64-linux-gnu/bits/string3.h" - "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h" - "/usr/include/x86_64-linux-gnu/bits/time.h" - "/usr/include/x86_64-linux-gnu/bits/timex.h" - "/usr/include/x86_64-linux-gnu/bits/types.h" - "/usr/include/x86_64-linux-gnu/bits/typesizes.h" - "/usr/include/x86_64-linux-gnu/bits/unistd.h" - "/usr/include/x86_64-linux-gnu/bits/waitflags.h" - "/usr/include/x86_64-linux-gnu/bits/waitstatus.h" - "/usr/include/x86_64-linux-gnu/bits/wchar.h" - "/usr/include/x86_64-linux-gnu/bits/wchar2.h" - "/usr/include/x86_64-linux-gnu/bits/wordsize.h" - "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h" - "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h" - "/usr/include/x86_64-linux-gnu/gnu/stubs.h" - "/usr/include/x86_64-linux-gnu/sys/cdefs.h" - "/usr/include/x86_64-linux-gnu/sys/select.h" - "/usr/include/x86_64-linux-gnu/sys/sysmacros.h" - "/usr/include/x86_64-linux-gnu/sys/types.h" - "/usr/include/xlocale.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h" - 
"/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h" - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h" - "/usr/local/cuda-6.5/include/CL/cl.h" - "/usr/local/cuda-6.5/include/CL/cl_ext.h" - "/usr/local/cuda-6.5/include/CL/cl_platform.h" - "/usr/local/cuda-6.5/include/builtin_types.h" - "/usr/local/cuda-6.5/include/channel_descriptor.h" - "/usr/local/cuda-6.5/include/common_functions.h" - "/usr/local/cuda-6.5/include/cuComplex.h" - "/usr/local/cuda-6.5/include/cublas_api.h" - "/usr/local/cuda-6.5/include/cublas_v2.h" - "/usr/local/cuda-6.5/include/cuda.h" - "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_runtime.h" - "/usr/local/cuda-6.5/include/cuda_runtime_api.h" - "/usr/local/cuda-6.5/include/cuda_surface_types.h" - "/usr/local/cuda-6.5/include/cuda_texture_types.h" - "/usr/local/cuda-6.5/include/curand.h" - "/usr/local/cuda-6.5/include/device_functions.h" - "/usr/local/cuda-6.5/include/device_launch_parameters.h" - "/usr/local/cuda-6.5/include/device_types.h" - "/usr/local/cuda-6.5/include/driver_functions.h" - "/usr/local/cuda-6.5/include/driver_types.h" - "/usr/local/cuda-6.5/include/host_config.h" - "/usr/local/cuda-6.5/include/host_defines.h" - "/usr/local/cuda-6.5/include/math_functions.h" - "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h" - "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_13_double_functions.h" - "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_20_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_30_intrinsics.h" - "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_32_intrinsics.h" - 
"/usr/local/cuda-6.5/include/sm_35_atomic_functions.h" - "/usr/local/cuda-6.5/include/sm_35_intrinsics.h" - "/usr/local/cuda-6.5/include/surface_functions.h" - "/usr/local/cuda-6.5/include/surface_indirect_functions.h" - "/usr/local/cuda-6.5/include/surface_types.h" - "/usr/local/cuda-6.5/include/texture_fetch_functions.h" - "/usr/local/cuda-6.5/include/texture_indirect_functions.h" - "/usr/local/cuda-6.5/include/texture_types.h" - "/usr/local/cuda-6.5/include/thrust/advance.h" - "/usr/local/cuda-6.5/include/thrust/detail/advance.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/no_throw_allocator.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.inl" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.h" - "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.inl" - "/usr/local/cuda-6.5/include/thrust/detail/config.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/compiler.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/compiler_fence.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/config.h" - 
"/usr/local/cuda-6.5/include/thrust/detail/config/debug.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/device_system.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/forceinline.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/global_workarounds.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/hd_warning_disable.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/host_device.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/host_system.h" - "/usr/local/cuda-6.5/include/thrust/detail/config/simple_defines.h" - "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.h" - "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.inl" - "/usr/local/cuda-6.5/include/thrust/detail/copy.h" - "/usr/local/cuda-6.5/include/thrust/detail/copy.inl" - "/usr/local/cuda-6.5/include/thrust/detail/cstdint.h" - "/usr/local/cuda-6.5/include/thrust/detail/device_free.inl" - "/usr/local/cuda-6.5/include/thrust/detail/device_malloc.inl" - "/usr/local/cuda-6.5/include/thrust/detail/device_ptr.inl" - "/usr/local/cuda-6.5/include/thrust/detail/device_reference.inl" - "/usr/local/cuda-6.5/include/thrust/detail/device_vector.inl" - "/usr/local/cuda-6.5/include/thrust/detail/dispatch/is_trivial_copy.h" - "/usr/local/cuda-6.5/include/thrust/detail/distance.inl" - "/usr/local/cuda-6.5/include/thrust/detail/equal.inl" - "/usr/local/cuda-6.5/include/thrust/detail/execution_policy.h" - "/usr/local/cuda-6.5/include/thrust/detail/extrema.inl" - "/usr/local/cuda-6.5/include/thrust/detail/fill.inl" - "/usr/local/cuda-6.5/include/thrust/detail/find.inl" - "/usr/local/cuda-6.5/include/thrust/detail/for_each.inl" - "/usr/local/cuda-6.5/include/thrust/detail/function.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional.inl" - "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.inl" - "/usr/local/cuda-6.5/include/thrust/detail/functional/argument.h" - 
"/usr/local/cuda-6.5/include/thrust/detail/functional/composite.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/arithmetic_operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/assignment_operator.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/bitwise_operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/compound_assignment_operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/logical_operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/operator_adaptors.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/relational_operators.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/placeholder.h" - "/usr/local/cuda-6.5/include/thrust/detail/functional/value.h" - "/usr/local/cuda-6.5/include/thrust/detail/generate.inl" - "/usr/local/cuda-6.5/include/thrust/detail/host_vector.inl" - "/usr/local/cuda-6.5/include/thrust/detail/internal_functional.h" - "/usr/local/cuda-6.5/include/thrust/detail/malloc_and_free.h" - "/usr/local/cuda-6.5/include/thrust/detail/minmax.h" - "/usr/local/cuda-6.5/include/thrust/detail/mismatch.inl" - "/usr/local/cuda-6.5/include/thrust/detail/numeric_traits.h" - "/usr/local/cuda-6.5/include/thrust/detail/overlapped_copy.h" - "/usr/local/cuda-6.5/include/thrust/detail/pair.inl" - "/usr/local/cuda-6.5/include/thrust/detail/pointer.h" - "/usr/local/cuda-6.5/include/thrust/detail/pointer.inl" - "/usr/local/cuda-6.5/include/thrust/detail/raw_pointer_cast.h" - "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.h" - "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.inl" - "/usr/local/cuda-6.5/include/thrust/detail/reduce.inl" - "/usr/local/cuda-6.5/include/thrust/detail/reference.h" - "/usr/local/cuda-6.5/include/thrust/detail/reference.inl" - 
"/usr/local/cuda-6.5/include/thrust/detail/reference_forward_declaration.h" - "/usr/local/cuda-6.5/include/thrust/detail/replace.inl" - "/usr/local/cuda-6.5/include/thrust/detail/scan.inl" - "/usr/local/cuda-6.5/include/thrust/detail/scatter.inl" - "/usr/local/cuda-6.5/include/thrust/detail/static_assert.h" - "/usr/local/cuda-6.5/include/thrust/detail/swap.h" - "/usr/local/cuda-6.5/include/thrust/detail/swap.inl" - "/usr/local/cuda-6.5/include/thrust/detail/swap_ranges.inl" - "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.h" - "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.inl" - "/usr/local/cuda-6.5/include/thrust/detail/temporary_buffer.h" - "/usr/local/cuda-6.5/include/thrust/detail/transform.inl" - "/usr/local/cuda-6.5/include/thrust/detail/transform_reduce.inl" - "/usr/local/cuda-6.5/include/thrust/detail/tuple.inl" - "/usr/local/cuda-6.5/include/thrust/detail/tuple_meta_transform.h" - "/usr/local/cuda-6.5/include/thrust/detail/tuple_transform.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/function_traits.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_member_function.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_nested_type.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_trivial_assign.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_call_possible.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_metafunction_defined.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_output_iterator.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/minimum_type.h" - "/usr/local/cuda-6.5/include/thrust/detail/type_traits/pointer_traits.h" - 
"/usr/local/cuda-6.5/include/thrust/detail/type_traits/result_of.h" - "/usr/local/cuda-6.5/include/thrust/detail/uninitialized_fill.inl" - "/usr/local/cuda-6.5/include/thrust/detail/use_default.h" - "/usr/local/cuda-6.5/include/thrust/detail/util/align.h" - "/usr/local/cuda-6.5/include/thrust/detail/util/blocking.h" - "/usr/local/cuda-6.5/include/thrust/detail/vector_base.h" - "/usr/local/cuda-6.5/include/thrust/detail/vector_base.inl" - "/usr/local/cuda-6.5/include/thrust/device_free.h" - "/usr/local/cuda-6.5/include/thrust/device_malloc.h" - "/usr/local/cuda-6.5/include/thrust/device_malloc_allocator.h" - "/usr/local/cuda-6.5/include/thrust/device_ptr.h" - "/usr/local/cuda-6.5/include/thrust/device_reference.h" - "/usr/local/cuda-6.5/include/thrust/device_vector.h" - "/usr/local/cuda-6.5/include/thrust/distance.h" - "/usr/local/cuda-6.5/include/thrust/equal.h" - "/usr/local/cuda-6.5/include/thrust/extrema.h" - "/usr/local/cuda-6.5/include/thrust/fill.h" - "/usr/local/cuda-6.5/include/thrust/find.h" - "/usr/local/cuda-6.5/include/thrust/for_each.h" - "/usr/local/cuda-6.5/include/thrust/functional.h" - "/usr/local/cuda-6.5/include/thrust/generate.h" - "/usr/local/cuda-6.5/include/thrust/host_vector.h" - "/usr/local/cuda-6.5/include/thrust/iterator/counting_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_assign.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_system_tag.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/counting_iterator.inl" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/device_system_tag.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/discard_iterator_base.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/distance_from_result.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/host_system_tag.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_iterator_category.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_trivial_iterator.h" - 
"/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_adaptor_base.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_system.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_traversal.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_facade_category.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traits.inl" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traversal_tags.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_category.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_system.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/normal_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/permutation_iterator_base.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator.inl" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator_base.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/tagged_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/transform_iterator.inl" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/tuple_of_iterator_references.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/universal_categories.h" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator.inl" - "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator_base.h" - "/usr/local/cuda-6.5/include/thrust/iterator/discard_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/iterator_adaptor.h" - "/usr/local/cuda-6.5/include/thrust/iterator/iterator_categories.h" - "/usr/local/cuda-6.5/include/thrust/iterator/iterator_facade.h" - "/usr/local/cuda-6.5/include/thrust/iterator/iterator_traits.h" - "/usr/local/cuda-6.5/include/thrust/iterator/permutation_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/reverse_iterator.h" - "/usr/local/cuda-6.5/include/thrust/iterator/transform_iterator.h" - 
"/usr/local/cuda-6.5/include/thrust/iterator/zip_iterator.h" - "/usr/local/cuda-6.5/include/thrust/memory.h" - "/usr/local/cuda-6.5/include/thrust/mismatch.h" - "/usr/local/cuda-6.5/include/thrust/pair.h" - "/usr/local/cuda-6.5/include/thrust/reduce.h" - "/usr/local/cuda-6.5/include/thrust/replace.h" - "/usr/local/cuda-6.5/include/thrust/scan.h" - "/usr/local/cuda-6.5/include/thrust/scatter.h" - "/usr/local/cuda-6.5/include/thrust/swap.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/assign_value.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/copy.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/execution_policy.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/extrema.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/find.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/for_each.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/generate.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/get_value.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/iter_swap.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/malloc_and_free.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/swap_ranges.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/temporary_buffer.h" - "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/transform.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/assign_value.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/inclusive_scan.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.inl" - 
"/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/cuda_launch_config.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/alignment.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/uninitialized.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/error.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/execution_policy.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/extern_shared_ptr.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/get_value.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/iter_swap.h" - 
"/usr/local/cuda-6.5/include/thrust/system/cuda/detail/malloc_and_free.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/swap_ranges.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/transform.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.h" - "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.inl" - "/usr/local/cuda-6.5/include/thrust/system/cuda/error.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/assign_value.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/copy.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/equal.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/extrema.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/fill.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/find.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/for_each.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/generate.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/get_value.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/iter_swap.h" - 
"/usr/local/cuda-6.5/include/thrust/system/detail/adl/malloc_and_free.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/mismatch.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/replace.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scatter.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/swap_ranges.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/temporary_buffer.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform_reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/adl/uninitialized_fill.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/bad_alloc.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/errno.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/error_category.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/error_code.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/error_condition.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.inl" - 
"/usr/local/cuda-6.5/include/thrust/system/detail/generic/fill.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/for_each.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/select_system.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/tag.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.h" - 
"/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/type_traits.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/decompose.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.inl" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/extrema.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/find.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/for_each.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/general_copy.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan_by_key.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/trivial_copy.h" - "/usr/local/cuda-6.5/include/thrust/system/detail/system_error.inl" - "/usr/local/cuda-6.5/include/thrust/system/error_code.h" - "/usr/local/cuda-6.5/include/thrust/system/system_error.h" - "/usr/local/cuda-6.5/include/thrust/system_error.h" - "/usr/local/cuda-6.5/include/thrust/transform.h" - "/usr/local/cuda-6.5/include/thrust/transform_reduce.h" - "/usr/local/cuda-6.5/include/thrust/tuple.h" - 
"/usr/local/cuda-6.5/include/thrust/uninitialized_fill.h" - "/usr/local/cuda-6.5/include/vector_functions.h" - "/usr/local/cuda-6.5/include/vector_types.h" - "/usr/local/include/boost/assert.hpp" - "/usr/local/include/boost/checked_delete.hpp" - "/usr/local/include/boost/config.hpp" - "/usr/local/include/boost/config/compiler/gcc.hpp" - "/usr/local/include/boost/config/compiler/nvcc.hpp" - "/usr/local/include/boost/config/no_tr1/memory.hpp" - "/usr/local/include/boost/config/no_tr1/utility.hpp" - "/usr/local/include/boost/config/platform/linux.hpp" - "/usr/local/include/boost/config/posix_features.hpp" - "/usr/local/include/boost/config/select_compiler_config.hpp" - "/usr/local/include/boost/config/select_platform_config.hpp" - "/usr/local/include/boost/config/select_stdlib_config.hpp" - "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp" - "/usr/local/include/boost/config/suffix.hpp" - "/usr/local/include/boost/config/user.hpp" - "/usr/local/include/boost/core/checked_delete.hpp" - "/usr/local/include/boost/core/demangle.hpp" - "/usr/local/include/boost/core/typeinfo.hpp" - "/usr/local/include/boost/current_function.hpp" - "/usr/local/include/boost/detail/sp_typeinfo.hpp" - "/usr/local/include/boost/detail/workaround.hpp" - "/usr/local/include/boost/exception/exception.hpp" - "/usr/local/include/boost/predef.h" - "/usr/local/include/boost/predef/architecture.h" - "/usr/local/include/boost/predef/architecture/alpha.h" - "/usr/local/include/boost/predef/architecture/arm.h" - "/usr/local/include/boost/predef/architecture/blackfin.h" - "/usr/local/include/boost/predef/architecture/convex.h" - "/usr/local/include/boost/predef/architecture/ia64.h" - "/usr/local/include/boost/predef/architecture/m68k.h" - "/usr/local/include/boost/predef/architecture/mips.h" - "/usr/local/include/boost/predef/architecture/parisc.h" - "/usr/local/include/boost/predef/architecture/ppc.h" - "/usr/local/include/boost/predef/architecture/pyramid.h" - 
"/usr/local/include/boost/predef/architecture/rs6k.h" - "/usr/local/include/boost/predef/architecture/sparc.h" - "/usr/local/include/boost/predef/architecture/superh.h" - "/usr/local/include/boost/predef/architecture/sys370.h" - "/usr/local/include/boost/predef/architecture/sys390.h" - "/usr/local/include/boost/predef/architecture/x86.h" - "/usr/local/include/boost/predef/architecture/x86/32.h" - "/usr/local/include/boost/predef/architecture/x86/64.h" - "/usr/local/include/boost/predef/architecture/z.h" - "/usr/local/include/boost/predef/compiler.h" - "/usr/local/include/boost/predef/compiler/borland.h" - "/usr/local/include/boost/predef/compiler/clang.h" - "/usr/local/include/boost/predef/compiler/comeau.h" - "/usr/local/include/boost/predef/compiler/compaq.h" - "/usr/local/include/boost/predef/compiler/diab.h" - "/usr/local/include/boost/predef/compiler/digitalmars.h" - "/usr/local/include/boost/predef/compiler/dignus.h" - "/usr/local/include/boost/predef/compiler/edg.h" - "/usr/local/include/boost/predef/compiler/ekopath.h" - "/usr/local/include/boost/predef/compiler/gcc.h" - "/usr/local/include/boost/predef/compiler/gcc_xml.h" - "/usr/local/include/boost/predef/compiler/greenhills.h" - "/usr/local/include/boost/predef/compiler/hp_acc.h" - "/usr/local/include/boost/predef/compiler/iar.h" - "/usr/local/include/boost/predef/compiler/ibm.h" - "/usr/local/include/boost/predef/compiler/intel.h" - "/usr/local/include/boost/predef/compiler/kai.h" - "/usr/local/include/boost/predef/compiler/llvm.h" - "/usr/local/include/boost/predef/compiler/metaware.h" - "/usr/local/include/boost/predef/compiler/metrowerks.h" - "/usr/local/include/boost/predef/compiler/microtec.h" - "/usr/local/include/boost/predef/compiler/mpw.h" - "/usr/local/include/boost/predef/compiler/palm.h" - "/usr/local/include/boost/predef/compiler/pgi.h" - "/usr/local/include/boost/predef/compiler/sgi_mipspro.h" - "/usr/local/include/boost/predef/compiler/sunpro.h" - 
"/usr/local/include/boost/predef/compiler/tendra.h" - "/usr/local/include/boost/predef/compiler/visualc.h" - "/usr/local/include/boost/predef/compiler/watcom.h" - "/usr/local/include/boost/predef/detail/_cassert.h" - "/usr/local/include/boost/predef/detail/_exception.h" - "/usr/local/include/boost/predef/detail/comp_detected.h" - "/usr/local/include/boost/predef/detail/os_detected.h" - "/usr/local/include/boost/predef/detail/test.h" - "/usr/local/include/boost/predef/language.h" - "/usr/local/include/boost/predef/language/objc.h" - "/usr/local/include/boost/predef/language/stdc.h" - "/usr/local/include/boost/predef/language/stdcpp.h" - "/usr/local/include/boost/predef/library.h" - "/usr/local/include/boost/predef/library/c.h" - "/usr/local/include/boost/predef/library/c/_prefix.h" - "/usr/local/include/boost/predef/library/c/gnu.h" - "/usr/local/include/boost/predef/library/c/uc.h" - "/usr/local/include/boost/predef/library/c/vms.h" - "/usr/local/include/boost/predef/library/c/zos.h" - "/usr/local/include/boost/predef/library/std.h" - "/usr/local/include/boost/predef/library/std/_prefix.h" - "/usr/local/include/boost/predef/library/std/cxx.h" - "/usr/local/include/boost/predef/library/std/dinkumware.h" - "/usr/local/include/boost/predef/library/std/libcomo.h" - "/usr/local/include/boost/predef/library/std/modena.h" - "/usr/local/include/boost/predef/library/std/msl.h" - "/usr/local/include/boost/predef/library/std/roguewave.h" - "/usr/local/include/boost/predef/library/std/sgi.h" - "/usr/local/include/boost/predef/library/std/stdcpp3.h" - "/usr/local/include/boost/predef/library/std/stlport.h" - "/usr/local/include/boost/predef/library/std/vacpp.h" - "/usr/local/include/boost/predef/make.h" - "/usr/local/include/boost/predef/os.h" - "/usr/local/include/boost/predef/os/aix.h" - "/usr/local/include/boost/predef/os/amigaos.h" - "/usr/local/include/boost/predef/os/android.h" - "/usr/local/include/boost/predef/os/beos.h" - "/usr/local/include/boost/predef/os/bsd.h" - 
"/usr/local/include/boost/predef/os/bsd/bsdi.h" - "/usr/local/include/boost/predef/os/bsd/dragonfly.h" - "/usr/local/include/boost/predef/os/bsd/free.h" - "/usr/local/include/boost/predef/os/bsd/net.h" - "/usr/local/include/boost/predef/os/bsd/open.h" - "/usr/local/include/boost/predef/os/cygwin.h" - "/usr/local/include/boost/predef/os/hpux.h" - "/usr/local/include/boost/predef/os/ios.h" - "/usr/local/include/boost/predef/os/irix.h" - "/usr/local/include/boost/predef/os/linux.h" - "/usr/local/include/boost/predef/os/macos.h" - "/usr/local/include/boost/predef/os/os400.h" - "/usr/local/include/boost/predef/os/qnxnto.h" - "/usr/local/include/boost/predef/os/solaris.h" - "/usr/local/include/boost/predef/os/unix.h" - "/usr/local/include/boost/predef/os/vms.h" - "/usr/local/include/boost/predef/os/windows.h" - "/usr/local/include/boost/predef/other.h" - "/usr/local/include/boost/predef/other/endian.h" - "/usr/local/include/boost/predef/platform.h" - "/usr/local/include/boost/predef/platform/mingw.h" - "/usr/local/include/boost/predef/platform/windows_desktop.h" - "/usr/local/include/boost/predef/platform/windows_phone.h" - "/usr/local/include/boost/predef/platform/windows_runtime.h" - "/usr/local/include/boost/predef/platform/windows_store.h" - "/usr/local/include/boost/predef/version_number.h" - "/usr/local/include/boost/shared_ptr.hpp" - "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp" - "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp" - "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp" - 
"/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp" - "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp" - "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp" - "/usr/local/include/boost/smart_ptr/shared_ptr.hpp" - "/usr/local/include/boost/throw_exception.hpp" - "/usr/local/include/gflags/gflags.h" - "/usr/local/include/gflags/gflags_declare.h" - "/usr/local/include/glog/log_severity.h" - "/usr/local/include/glog/logging.h" - "/usr/local/include/glog/vlog_is_on.h" -) - diff --git a/src/caffe/CMakeFiles/progress.marks b/src/caffe/CMakeFiles/progress.marks deleted file mode 100644 index abdfb053..00000000 --- a/src/caffe/CMakeFiles/progress.marks +++ /dev/null @@ -1 +0,0 @@ -60 diff --git a/src/caffe/CMakeFiles/proto.dir/CXX.includecache b/src/caffe/CMakeFiles/proto.dir/CXX.includecache deleted file mode 100644 index df68b9a9..00000000 --- a/src/caffe/CMakeFiles/proto.dir/CXX.includecache +++ /dev/null @@ -1,48 +0,0 @@ -#IncludeRegexLine: ^[ ]*#[ ]*(include|import)[ ]*[<"]([^">]+)([">]) - -#IncludeRegexScan: ^.*$ - -#IncludeRegexComplain: ^$ - -#IncludeRegexTransform: - -/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc -caffe.pb.h -/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h -algorithm -- -google/protobuf/stubs/common.h -- -google/protobuf/stubs/once.h -- -google/protobuf/io/coded_stream.h -- -google/protobuf/wire_format_lite_inl.h -- -google/protobuf/descriptor.h -- -google/protobuf/generated_message_reflection.h -- -google/protobuf/reflection_ops.h -- -google/protobuf/wire_format.h -- - -/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h -string -- -google/protobuf/stubs/common.h -- -google/protobuf/generated_message_util.h -- -google/protobuf/message.h -- -google/protobuf/repeated_field.h -- -google/protobuf/extension_set.h -- -google/protobuf/generated_enum_reflection.h -- -google/protobuf/unknown_field_set.h -- - diff --git 
a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake deleted file mode 100644 index 44c81e52..00000000 --- a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake +++ /dev/null @@ -1,39 +0,0 @@ -# The set of languages for which implicit dependencies are needed: -SET(CMAKE_DEPENDS_LANGUAGES - "CXX" - ) -# The set of files for implicit dependencies of each language: -SET(CMAKE_DEPENDS_CHECK_CXX - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" - ) -SET(CMAKE_CXX_COMPILER_ID "GNU") - -# Preprocessor definitions for this target. -SET(CMAKE_TARGET_DEFINITIONS - "GTEST_USE_OWN_TR1_TUPLE" - ) - -# Pairs of files generated by the same build rule. -SET(CMAKE_MULTIPLE_OUTPUT_PAIRS - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" - ) - - -# Targets to which this target links. -SET(CMAKE_TARGET_LINKED_INFO_FILES - ) - -# The include file search paths: -SET(CMAKE_C_TARGET_INCLUDE_PATH - "src" - "/usr/local/include" - "include" - "/usr/local/cuda/include" - "/usr/local/include/opencv" - "/usr/include/atlas" - "." - ) -SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/CMakeFiles/proto.dir/build.make b/src/caffe/CMakeFiles/proto.dir/build.make deleted file mode 100644 index 1467c124..00000000 --- a/src/caffe/CMakeFiles/proto.dir/build.make +++ /dev/null @@ -1,119 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! 
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Remove some rules from gmake that .SUFFIXES does not remove. -SUFFIXES = - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E remove -f - -# Escaping for special characters. -EQUALS = = - -# The program to use to edit the cache. -CMAKE_EDIT_COMMAND = /usr/bin/ccmake - -# The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# Include any dependencies generated for this target. -include src/caffe/CMakeFiles/proto.dir/depend.make - -# Include the progress variables for this target. -include src/caffe/CMakeFiles/proto.dir/progress.make - -# Include the compile flags for this target's objects. 
-include src/caffe/CMakeFiles/proto.dir/flags.make - -include/caffe/proto/caffe.pb.cc: src/caffe/proto/caffe.proto - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Running C++/Python protocol buffer compiler on /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --cpp_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --python_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto - -include/caffe/proto/caffe.pb.h: include/caffe/proto/caffe.pb.cc - -include/caffe/proto/caffe_pb2.py: include/caffe/proto/caffe.pb.cc - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: src/caffe/CMakeFiles/proto.dir/flags.make -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc > CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires: -.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires - $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build -.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o - -# Object files for target proto -proto_OBJECTS = \ -"CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" - -# External object files for target proto -proto_EXTERNAL_OBJECTS = - 
-lib/libproto.a: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -lib/libproto.a: src/caffe/CMakeFiles/proto.dir/build.make -lib/libproto.a: src/caffe/CMakeFiles/proto.dir/link.txt - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libproto.a" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean_target.cmake - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/proto.dir/link.txt --verbose=$(VERBOSE) - -# Rule to build all files generated by this target. -src/caffe/CMakeFiles/proto.dir/build: lib/libproto.a -.PHONY : src/caffe/CMakeFiles/proto.dir/build - -src/caffe/CMakeFiles/proto.dir/requires: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires -.PHONY : src/caffe/CMakeFiles/proto.dir/requires - -src/caffe/CMakeFiles/proto.dir/clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean.cmake -.PHONY : src/caffe/CMakeFiles/proto.dir/clean - -src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.cc -src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.h -src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe_pb2.py - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake --color=$(COLOR) -.PHONY : src/caffe/CMakeFiles/proto.dir/depend - diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake deleted file mode 100644 index 79cb425a..00000000 --- 
a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake +++ /dev/null @@ -1,13 +0,0 @@ -FILE(REMOVE_RECURSE - "../../include/caffe/proto/caffe.pb.cc" - "../../include/caffe/proto/caffe.pb.h" - "../../include/caffe/proto/caffe_pb2.py" - "CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o" - "../../lib/libproto.pdb" - "../../lib/libproto.a" -) - -# Per-language clean rules from dependency scanning. -FOREACH(lang CXX) - INCLUDE(CMakeFiles/proto.dir/cmake_clean_${lang}.cmake OPTIONAL) -ENDFOREACH(lang) diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake deleted file mode 100644 index 6172b692..00000000 --- a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake +++ /dev/null @@ -1,3 +0,0 @@ -FILE(REMOVE_RECURSE - "../../lib/libproto.a" -) diff --git a/src/caffe/CMakeFiles/proto.dir/depend.internal b/src/caffe/CMakeFiles/proto.dir/depend.internal deleted file mode 100644 index 2f8ec677..00000000 --- a/src/caffe/CMakeFiles/proto.dir/depend.internal +++ /dev/null @@ -1,6 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o - /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc - /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h diff --git a/src/caffe/CMakeFiles/proto.dir/depend.make b/src/caffe/CMakeFiles/proto.dir/depend.make deleted file mode 100644 index 239c4242..00000000 --- a/src/caffe/CMakeFiles/proto.dir/depend.make +++ /dev/null @@ -1,6 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! 
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc -src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.h - diff --git a/src/caffe/CMakeFiles/proto.dir/flags.make b/src/caffe/CMakeFiles/proto.dir/flags.make deleted file mode 100644 index 8b4ef992..00000000 --- a/src/caffe/CMakeFiles/proto.dir/flags.make +++ /dev/null @@ -1,8 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# compile CXX with /usr/bin/c++ -CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe - -CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE - diff --git a/src/caffe/CMakeFiles/proto.dir/link.txt b/src/caffe/CMakeFiles/proto.dir/link.txt deleted file mode 100644 index 42f85bda..00000000 --- a/src/caffe/CMakeFiles/proto.dir/link.txt +++ /dev/null @@ -1,2 +0,0 @@ -/usr/bin/ar cr ../../lib/libproto.a CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -/usr/bin/ranlib ../../lib/libproto.a diff --git a/src/caffe/CMakeFiles/proto.dir/progress.make b/src/caffe/CMakeFiles/proto.dir/progress.make deleted file mode 100644 index 25d32761..00000000 --- a/src/caffe/CMakeFiles/proto.dir/progress.make +++ /dev/null @@ -1,3 +0,0 @@ -CMAKE_PROGRESS_1 = 67 -CMAKE_PROGRESS_2 = - diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt deleted file mode 100644 index 40e6c11f..00000000 --- a/src/caffe/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# generate protobuf sources -file(GLOB proto_files proto/*.proto) -caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs 
proto_python ${proto_files}) - -# include python files either to force generation -add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) -set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend! -caffe_default_properties(proto) - -# --[ Caffe library - -# creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists -caffe_pickup_caffe_sources(${PROJECT_SOURCE_DIR}) - -if(HAVE_CUDA) - caffe_cuda_compile(cuda_objs ${cuda}) - list(APPEND srcs ${cuda_objs} ${cuda}) -endif() - -add_library(caffe ${srcs}) -target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) -caffe_default_properties(caffe) - -# ---[ Tests - add_subdirectory(test) - -# ---[ Install -install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) -install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) - -file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) -list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) -install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - From 04d42ec6da24f922eee084e0b6b75f3f427db5c9 Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 13 Sep 2015 23:44:02 +0800 Subject: [PATCH 101/124] Enable CPU_ONLY flag --- include/caffe/common.hpp | 5 +- include/caffe/device.hpp | 5 +- include/caffe/syncedmem.hpp | 10 +- include/caffe/util/im2col.hpp | 2 + include/caffe/util/math_functions.hpp | 2 - include/caffe/util/ocl_util.hpp | 3 +- include/caffe/util/ocl_wrapper.hpp | 2 + include/caffe/vision_layers.hpp | 3 +- src/caffe/device.cpp | 2 + src/caffe/layers/absval_layer.cpp | 3 +- src/caffe/layers/base_conv_layer.cpp | 5 + src/caffe/layers/base_data_layer.cpp | 7 +- src/caffe/layers/bnll_layer.cpp | 3 +- src/caffe/layers/concat_layer.cpp | 3 +- src/caffe/layers/contrastive_loss_layer.cpp | 3 +- src/caffe/layers/conv_layer.cpp | 3 +- src/caffe/layers/deconv_layer.cpp | 4 +- src/caffe/layers/dropout_layer.cpp | 3 +- src/caffe/layers/eltwise_layer.cpp | 3 +- 
src/caffe/layers/euclidean_loss_layer.cpp | 3 +- src/caffe/layers/exp_layer.cpp | 3 +- src/caffe/layers/filter_layer.cpp | 3 +- src/caffe/layers/hdf5_data_layer.cpp | 3 +- src/caffe/layers/hdf5_output_layer.cpp | 3 +- src/caffe/layers/im2col_layer.cpp | 3 +- src/caffe/layers/inner_product_layer.cpp | 3 +- src/caffe/layers/log_layer.cpp | 3 +- src/caffe/layers/lrn_layer.cpp | 3 +- src/caffe/layers/mvn_layer.cpp | 3 +- src/caffe/layers/pooling_layer.cpp | 3 +- src/caffe/layers/power_layer.cpp | 3 +- src/caffe/layers/prelu_layer.cpp | 3 +- src/caffe/layers/reduction_layer.cpp | 3 +- src/caffe/layers/relu_layer.cpp | 3 +- .../sigmoid_cross_entropy_loss_layer.cpp | 3 +- src/caffe/layers/sigmoid_layer.cpp | 4 +- src/caffe/layers/silence_layer.cpp | 3 +- src/caffe/layers/slice_layer.cpp | 4 +- src/caffe/layers/softmax_layer.cpp | 4 +- src/caffe/layers/softmax_loss_layer.cpp | 12 +- src/caffe/layers/split_layer.cpp | 5 +- src/caffe/layers/tanh_layer.cpp | 3 +- src/caffe/layers/threshold_layer.cpp | 3 +- src/caffe/net.cpp | 4 + src/caffe/solver.cpp | 10 - src/caffe/syncedmem.cpp | 8 + src/caffe/util/im2col.cpp | 3 +- src/caffe/util/math_functions.cpp | 917 +++++++++--------- src/caffe/util/ocl_util.cpp | 4 + src/caffe/util/ocl_wrapper.cpp | 5 + 50 files changed, 574 insertions(+), 534 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index f5c65eb9..7aed6007 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -1,7 +1,6 @@ #ifndef CAFFE_COMMON_HPP_ #define CAFFE_COMMON_HPP_ -#include #include #include #include @@ -16,8 +15,12 @@ #include #include // pair #include + +#ifndef CPU_ONLY #include #include +#include +#endif #include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp index 1d9fa6fe..b6190f28 100644 --- a/include/caffe/device.hpp +++ b/include/caffe/device.hpp @@ -26,12 +26,11 @@ #ifndef CAFFE_DEVICE_HPP #define CAFFE_DEVICE_HPP -#include 
#include #include #include "caffe/common.hpp" namespace caffe { - +#ifndef CPU_ONLY class Device { public: Device() @@ -80,7 +79,7 @@ class Device { }; extern std::string buildOption; extern Device amdDevice; - +#endif } // namespace caffe #endif //CAFFE_DEVICE_HPP diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 1647b6f3..4092b5ac 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -68,12 +68,16 @@ class SyncedMemory { SyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_( false), data_layer_(false) { - ocl_setup(); +#ifndef CPU_ONLY + ocl_setup(); +#endif } explicit SyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_( false), data_layer_(false) { - ocl_setup(); +#ifndef CPU_ONLY + ocl_setup(); +#endif } ~SyncedMemory(); @@ -95,8 +99,10 @@ class SyncedMemory { void set_data_layer() { data_layer_ = true; } +#ifndef CPU_ONLY private: void ocl_setup(); +#endif protected: cl_kernel oclmem_kernel; diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index f962049d..531b11ad 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -39,6 +39,7 @@ void col2im_cpu(const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); +#ifndef CPU_ONLY template void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, @@ -97,6 +98,7 @@ template void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col, cl_kernel Kernel); +#endif } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/math_functions.hpp 
b/include/caffe/util/math_functions.hpp index d7c67673..7178ea74 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -116,8 +116,6 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) { inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { #ifndef CPU_ONLY ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); -#else - NO_GPU; #endif } diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 776fec11..00bfa3cf 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -28,7 +28,7 @@ #define _CAFFE_UTIL_OCL_UTIL_HPP_ namespace caffe { - +#ifndef CPU_ONLY template void ocl_memset(Dtype* buffer, const Dtype value, const int count); @@ -36,6 +36,7 @@ void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count); void eventCallback(cl_event event, cl_int event_status, void * user_data); +#endif } // namespace caffe #endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 14cf48e9..a1d11d18 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -49,6 +49,7 @@ template inline std::string get_dtype_suffix() { return suffix; } +#ifndef CPU_ONLY template void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num); @@ -339,6 +340,7 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a, template void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx, const int* mask, Dtype* bottom_diff); +#endif } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index bc6cd5de..e2a9b190 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -232,7 +232,7 @@ class ConvolutionLayer: public BaseConvolutionLayer 
{ return false; } virtual void compute_output_shape(); - +#ifndef CPU_ONLY virtual void Forward_gpu_org(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, @@ -241,6 +241,7 @@ class ConvolutionLayer: public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_opt2(const vector*>& top, const vector& propagate_down, const vector*>& bottom); +#endif }; /** diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp index bb8f9cb6..fcbffe09 100644 --- a/src/caffe/device.cpp +++ b/src/caffe/device.cpp @@ -33,6 +33,7 @@ #include namespace caffe { +#ifndef CPU_ONLY string buildOption = "-x clc++ "; std::string oclKernelPath = "./src/caffe/ocl/"; Device amdDevice; @@ -420,5 +421,6 @@ void Device::appendBitfield(T info, T value, std::string name, } } +#endif } // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 945162af..6e06b558 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -35,6 +35,7 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void AbsValLayer::Forward_gpu(const vector*>& bottom, @@ -57,7 +58,7 @@ void AbsValLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(AbsValLayer); #endif diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ee0df02f..04cd38dd 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -9,6 +9,7 @@ namespace caffe { +#ifndef CPU_ONLY #ifdef use_packing_scheme template size_t BaseConvolutionLayer::subtop_mem_size = sizeof(Dtype); template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); @@ -46,6 +47,8 @@ void BaseConvolutionLayer::ocl_setup() { #endif } +#endif + template BaseConvolutionLayer::~BaseConvolutionLayer() { } @@ -204,8 +207,10 @@ void BaseConvolutionLayer::Reshape(const vector*>& 
bottom, caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); } +#ifndef CPU_ONLY //initializa OpenCL kernels and cl_mem objects ocl_setup(); +#endif } template diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index d02e92c4..ff4436a7 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -80,6 +80,8 @@ void BasePrefetchingDataLayer::Forward_cpu( CreatePrefetchThread(); } +#ifndef CPU_ONLY + template void BasePrefetchingDataLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -104,15 +106,12 @@ void BasePrefetchingDataLayer::Forward_gpu( 0, NULL, NULL)); } -#ifdef Track_data_transfer -#endif - // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); } -#ifdef CPU_ONLY +#else STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index c2cce9e3..ed9cc1d4 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -38,6 +38,7 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void BNLLLayer::Forward_gpu(const vector*>& bottom, @@ -62,7 +63,7 @@ void BNLLLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(BNLLLayer); #endif diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 5a351009..5cceb9ff 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -94,6 +94,7 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void ConcatLayer::Forward_gpu(const vector*>& bottom, @@ -139,7 +140,7 @@ void ConcatLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(ConcatLayer); #endif diff --git a/src/caffe/layers/contrastive_loss_layer.cpp 
b/src/caffe/layers/contrastive_loss_layer.cpp index a8e6f523..6dda7d61 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -101,6 +101,7 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void ContrastiveLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -156,7 +157,7 @@ void ContrastiveLossLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(ContrastiveLossLayer); #endif diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 9c250c42..b64eb1aa 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -70,6 +70,7 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -228,7 +229,7 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(ConvolutionLayer); #endif diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 8ee81c9f..2504f43a 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -69,6 +69,8 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY + template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -125,7 +127,7 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(DeconvolutionLayer); #endif diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 17196f10..2cb50ead 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -66,6 +66,7 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code is 
written/modified by AMD template void DropoutLayer::Forward_gpu(const vector*>& bottom, @@ -103,7 +104,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(DropoutLayer); #endif diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index e2e5e1ab..971703f4 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -155,6 +155,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void EltwiseLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -241,7 +242,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(EltwiseLayer); #endif diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index fce99953..2130c6f4 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -43,6 +43,7 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, @@ -72,7 +73,7 @@ void EuclideanLossLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(EuclideanLossLayer); #endif diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 25bcd0a0..3fe7cde4 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -62,6 +62,7 @@ void ExpLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void ExpLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -95,7 +96,7 @@ void ExpLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(ExpLayer); #endif diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index fc3ca142..2cd9957d 100644 --- 
a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -118,6 +118,7 @@ void FilterLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void FilterLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -177,7 +178,7 @@ void FilterLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(FilterLayer); #endif diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2d7d405e..28eee444 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -159,6 +159,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -197,7 +198,7 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, } } -#ifdef CPU_ONLY +#else STUB_GPU_FORWARD(HDF5DataLayer, Forward); #endif diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f9215a3d..11d01647 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -67,6 +67,7 @@ void HDF5OutputLayer::Backward_cpu(const vector*>& top, return; } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, @@ -103,7 +104,7 @@ void HDF5OutputLayer::Backward_gpu(const vector*>& top, return; } -#ifdef CPU_ONLY +#else STUB_GPU(HDF5OutputLayer); #endif diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 886782b9..f51fd7cc 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -87,6 +87,7 @@ void Im2colLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void Im2colLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -111,7 +112,7 @@ void 
Im2colLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(Im2colLayer); #endif diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index b9ae3370..b40e3e7d 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -119,6 +119,7 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void InnerProductLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -157,7 +158,7 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(InnerProductLayer); #endif diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index f6ace662..5dbbca74 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -79,6 +79,7 @@ void LogLayer::Backward_cpu(const vector*>& top, caffe_mul(count, top_diff, bottom_diff, bottom_diff); } +#ifndef CPU_ONLY template void LogLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -126,7 +127,7 @@ void LogLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } -#ifdef CPU_ONLY +#else STUB_GPU(LogLayer); #endif diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 00e554bd..da3d1fc3 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -251,6 +251,7 @@ void LRNLayer::WithinChannelBackward(const vector*>& top, } } +#ifndef CPU_ONLY template void LRNLayer::CrossChannelForward_gpu( const vector*>& bottom, const vector*>& top) { @@ -309,7 +310,7 @@ void LRNLayer::Backward_gpu(const vector*>& top, LOG(FATAL) << "Unknown normalization region."; } } -#ifdef CPU_ONLY +#else STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 64c3063f..2c4acb14 
100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -122,6 +122,7 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void MVNLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -228,7 +229,7 @@ void MVNLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(MVNLayer); #endif diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 85c57379..0becf164 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -313,6 +313,7 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY // begin: code written/modified by AMD template void PoolingLayer::Forward_gpu(const vector*>& bottom, @@ -409,7 +410,7 @@ void PoolingLayer::Backward_gpu(const vector*>& top, } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(PoolingLayer); #endif diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 6b2c5f1d..a0f5ccee 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -96,6 +96,7 @@ void PowerLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void PowerLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -169,7 +170,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, } } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(PowerLayer); #endif diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 8ec6664d..75aa3968 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -128,6 +128,7 @@ void PReLULayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void PReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -197,7 +198,7 @@ void PReLULayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else 
STUB_GPU(PReLULayer); #endif diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 89df6589..9ec057b1 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -124,6 +124,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void ReductionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -207,7 +208,7 @@ void ReductionLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(ReductionLayer); #endif diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index b07e6447..132d7b4b 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -34,6 +34,7 @@ void ReLULayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void ReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -57,7 +58,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(ReLULayer); #endif diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 4048a8e8..f074ac51 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -71,6 +71,7 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } +#ifndef CPU_ONLY template void SigmoidCrossEntropyLossLayer::Backward_gpu( const vector*>& top, const vector& propagate_down, @@ -94,7 +95,7 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( } } -#ifdef CPU_ONLY +#else STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index a4359920..737bff74 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -39,6 +39,8 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY + template void 
SigmoidLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -62,7 +64,7 @@ void SigmoidLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(SigmoidLayer); #endif diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 1c463499..a6c30fbb 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -16,6 +16,7 @@ void SilenceLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void SilenceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -33,7 +34,7 @@ void SilenceLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(SilenceLayer); #endif diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index da4059a0..8263b92b 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -110,7 +110,7 @@ void SliceLayer::Backward_cpu(const vector*>& top, offset_slice_axis += top_slice_axis; } } - +#ifndef CPU_ONLY template void SliceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -121,7 +121,7 @@ void SliceLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { } -#ifdef CPU_ONLY +#else STUB_GPU(SliceLayer); #endif diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 92162821..366946bd 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -89,6 +89,8 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } + +#ifndef CPU_ONLY // begin: code written/modified by AMD template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, @@ -147,7 +149,7 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(SoftmaxLayer); #endif diff --git 
a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 62c10e30..2241bd6c 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -27,15 +27,6 @@ void SoftmaxWithLossLayer::LayerSetUp(const vector*>& bottom, ignore_label_ = this->layer_param_.loss_param().ignore_label(); } normalize_ = this->layer_param_.loss_param().normalize(); - - ocl_setup(); -} - -template -void SoftmaxWithLossLayer::ocl_setup() { - d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, - sizeof(Dtype), NULL, NULL); - } template @@ -134,6 +125,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } // begin: code written/modified by AMD +#ifndef CPU_ONLY template void SoftmaxWithLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -200,7 +192,7 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, } } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(SoftmaxWithLossLayer); #endif diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 7a40bf8a..57677b5b 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -21,8 +21,6 @@ void SplitLayer::Reshape(const vector*>& bottom, top[i]->ReshapeLike(*bottom[0]); CHECK_EQ(count_, top[i]->count()); } - gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", - NULL); } template @@ -53,6 +51,7 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void SplitLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -82,7 +81,7 @@ void SplitLayer::Backward_gpu(const vector*>& top, } } // end: code written/modified by AMD -#ifdef CPU_ONLY +#else STUB_GPU(SplitLayer); #endif diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 3e85330c..7a15809d 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -37,6 +37,7 @@ void 
TanHLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY template void TanHLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -60,7 +61,7 @@ void TanHLayer::Backward_gpu(const vector*>& top, } } -#ifdef CPU_ONLY +#else STUB_GPU(TanHLayer); #endif diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 16ca8944..a4c543ee 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -24,6 +24,7 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } } +#ifndef CPU_ONLY template void ThresholdLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -34,7 +35,7 @@ void ThresholdLayer::Forward_gpu(const vector*>& bottom, ThresholdForward(count, threshold_, bottom_data, top_data); } -#ifdef CPU_ONLY +#else STUB_GPU_FORWARD(ThresholdLayer, Forward); #endif diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 6911854c..711ec408 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -528,7 +528,9 @@ Dtype Net::ForwardFromTo(int start, int end) { if (debug_info_) { ForwardDebugInfo(i); } +#ifndef CPU_ONLY clFinish(amdDevice.CommandQueue); +#endif layer_timer.Stop(); printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); @@ -608,7 +610,9 @@ void Net::BackwardFromTo(int start, int end) { if (debug_info_) { BackwardDebugInfo(i); } +#ifndef CPU_ONLY clFinish(amdDevice.CommandQueue); +#endif layer_timer.Start(); printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds()); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 8d7f8238..20af4160 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -19,14 +19,6 @@ Solver::Solver(const SolverParameter& param) Init(param); } -template -void Solver::ocl_setup() { - scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL); - add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL); - 
div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL); - powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL); -} - template Solver::Solver(const string& param_file) : net_() { @@ -42,8 +34,6 @@ void Solver::Init(const SolverParameter& param) { param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - ocl_setup(); - if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index db470434..a3fa9973 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -36,6 +36,7 @@ namespace caffe { SyncedMemory::~SyncedMemory() { +#ifndef CPU_ONLY if (cpu_ptr_ && own_cpu_data_) { OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, @@ -50,23 +51,30 @@ SyncedMemory::~SyncedMemory() { } clReleaseKernel (oclmem_kernel); +#endif } //begin: code written/modified by AMD. +#ifndef CPU_ONLY void SyncedMemory::ocl_setup() { cl_int err = 0; oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err); OCL_CHECK(err); } +#endif inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: +#ifndef CPU_ONLY gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL); cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); +#else + CaffeMallocHost(&cpu_ptr_, size_); +#endif memset(cpu_ptr_, 0, size_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 25349d26..ab023e70 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -103,6 +103,7 @@ template void col2im_cpu(const double* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); +#ifndef CPU_ONLY template void col2im_gpu_opt(const Dtype* data_col, const 
int col_offset, const int channels, const int height, const int width, const int ksize, @@ -366,5 +367,5 @@ template void col2im_gpu(const float* data_col, const int col_offset, template void col2im_gpu(const double* data_col, const int col_offset, const int channels, const int height, const int width, const int psize, const int pad, const int stride, double* data_im, const int img_offset); - +#endif } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 96ec98b1..d1cfc954 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -62,6 +62,395 @@ void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, beta, C, N); } +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_axpy(const int N, const float alpha, const float* X, + float* Y) { + cblas_saxpy(N, alpha, X, 1, Y, 1); +} + +template <> +void caffe_axpy(const int N, const double alpha, const double* X, + double* Y) { + cblas_daxpy(N, alpha, X, 1, Y, 1); +} + +template <> +void caffe_set(const int N, const float alpha, float* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(float) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template <> +void caffe_set(const int N, const double alpha, double* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template <> +void caffe_add_scalar(const int N, const float alpha, float* Y) { + for (int i = 0; i < N; 
++i) { + Y[i] += alpha; + } +} + +template <> +void caffe_add_scalar(const int N, const double alpha, double* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template <> +void caffe_copy(const int N, const float* X, float* Y) { + cblas_scopy(N, X, 1, Y, 1); +} + +template <> +void caffe_copy(const int N, const double* X, double* Y) { + cblas_dcopy(N, X, 1, Y, 1); +} + +template <> +void caffe_scal(const int N, const float alpha, float *X) { + cblas_sscal(N, alpha, X, 1); +} + +template <> +void caffe_scal(const int N, const double alpha, double *X) { + cblas_dscal(N, alpha, X, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_add(const int n, const float* a, const float* b, float* y) { + vsAdd(n, a, b, y); +} + +template <> +void caffe_add(const int n, const double* a, const double* b, + double* y) { + vdAdd(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const float* a, const float* b, float* y) { + vsSub(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const double* a, const double* b, + double* y) { + vdSub(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const float* a, const float* b, float* y) { + vsMul(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const double* a, const double* b, + double* y) { + vdMul(n, a, b, y); +} + +template <> +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +void 
caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { +#ifndef CPU_ONLY + // NOLINT_NEXT_LINE(caffe/alt_fn) + //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); +#else + NO_GPU; +#endif + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, float* y) { + vsDiv(n, a, b, y); +} + +template <> +void caffe_div(const int n, const double* a, const double* b, + double* y) { + vdDiv(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const float* a, const float b, float* y) { + vsPowx(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const double* a, const double b, + double* y) { + vdPowx(n, a, b, y); +} + +template <> +void caffe_sqr(const int n, const float* a, float* y) { + vsSqr(n, a, y); +} + +template <> +void 
caffe_sqr(const int n, const double* a, double* y) { + vdSqr(n, a, y); +} + +template <> +void caffe_exp(const int n, const float* a, float* y) { + vsExp(n, a, y); +} + +template <> +void caffe_exp(const int n, const double* a, double* y) { + vdExp(n, a, y); +} + +unsigned int caffe_rng_rand() { + return (*caffe_rng())(); +} + +template +Dtype caffe_nextafter(const Dtype b) { + return boost::math::nextafter < Dtype + > (b, std::numeric_limits < Dtype > ::max()); +} +template float caffe_nextafter(const float b); +template double caffe_nextafter(const double b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real < Dtype + > random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } + + //LOG(INFO) << "caffe_rng_uniform"; +} + +template void caffe_rng_uniform(const int n, const float a, const float b, + float* r); +template void caffe_rng_uniform(const int n, const double a, const double b, + double* r); + +template +void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma, + Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution < Dtype > random_distribution(a, sigma); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template void caffe_rng_gaussian(const int n, const float mu, const float sigma, + float* r); +template void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + 
caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template void caffe_rng_bernoulli(const int n, const double p, int* r); +template void caffe_rng_bernoulli(const int n, const float p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } +} + +template void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); +template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); +// +template <> +float caffe_cpu_dot(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); +} + +template <> +double caffe_cpu_dot(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); +} + +template <> +int caffe_cpu_hamming_distance(const int n, const float* x, + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount( + static_cast(x[i]) ^ static_cast(y[i])); + } + return dist; +} + +template <> +int caffe_cpu_hamming_distance(const int n, const double* x, + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl( + static_cast(x[i]) ^ static_cast(y[i])); + } + return dist; +} + +template <> +float caffe_cpu_asum(const int n, const float* x) { + return cblas_sasum(n, x, 1); +} + +template <> +double caffe_cpu_asum(const int n, const double* x) { + return cblas_dasum(n, x, 1); +} + +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); + +template <> +void caffe_cpu_scale(const int n, const float alpha, const float *x, + float* y) { + cblas_scopy(n, x, 1, y, 1); + 
cblas_sscal(n, alpha, y, 1); +} + +template <> +void caffe_cpu_scale(const int n, const double alpha, const double *x, + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); +} + +#ifndef CPU_ONLY +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) +// - (x[index] < Dtype(0))); +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); + template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, @@ -183,20 +572,6 @@ cl_event caffe_gpu_gemm(cl_command_queue *queue, return event; } -template <> -void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); -} - -template <> -void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); -} - template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, size_t offA, int lda, @@ -221,7 +596,6 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); - } template <> @@ -248,18 +622,6 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, &(amdDevice.CommandQueue), 0, NULL, NULL)); } -template <> -void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { - cblas_saxpy(N, alpha, X, 1, Y, 1); -} - -template <> -void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { - cblas_daxpy(N, alpha, X, 1, Y, 1); -} - template <> void 
caffe_gpu_axpy(const int N, const float alpha, const float* X, float* Y) { @@ -277,348 +639,94 @@ void caffe_gpu_axpy(const int N, const double alpha, const double* X, } template <> -void caffe_gpu_sgnbit(const int n, const float* x, float* y) { -} - -template <> -void caffe_gpu_sgnbit(const int n, const double* x, double* y) { -} - -template <> -void caffe_gpu_abs(const int n, const float* x, float* y) { - caffe_gpu_abs_ocl(n, x, y); -} - -template <> -void caffe_gpu_abs(const int n, const double* x, double* y) { - caffe_gpu_abs_ocl(n, x, y); -} - -template <> -void caffe_set(const int N, const float alpha, float* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(float) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } -} - -template <> -void caffe_set(const int N, const double alpha, double* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(double) * N); - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } -} - -template <> -void caffe_add_scalar(const int N, const float alpha, float* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } -} - -template <> -void caffe_add_scalar(const int N, const double alpha, double* Y) { - for (int i = 0; i < N; ++i) { - Y[i] += alpha; - } -} - -template <> -void caffe_copy(const int N, const float* X, float* Y) { - cblas_scopy(N, X, 1, Y, 1); -} - -template <> -void caffe_copy(const int N, const double* X, double* Y) { - cblas_dcopy(N, X, 1, Y, 1); -} - -//template -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { - clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, - NULL, NULL); -// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); -} -/* - template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); - template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); - template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); - template void 
caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); - */ -template <> -void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, - N, 0, NULL, NULL)); -} - -template <> -void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { - OCL_CHECK( - clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, - N, 0, NULL, NULL)); -} - -template <> -void caffe_gpu_copy(const int N, const float* X, float* Y) { - if (X != Y) { - CLBLAS_CHECK( - clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } -} - -template <> -void caffe_gpu_copy(const int N, const double* X, double* Y) { - if (X != Y) { - CLBLAS_CHECK( - clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } -} - -template <> -void caffe_scal(const int N, const float alpha, float *X) { - cblas_sscal(N, alpha, X, 1); -} - -template <> -void caffe_scal(const int N, const double alpha, double *X) { - cblas_dscal(N, alpha, X, 1); -} - -template <> -void caffe_gpu_scal(const int N, const float alpha, float *X) { - CLBLAS_CHECK( - clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); -} - -template <> -void caffe_gpu_scal(const int N, const double alpha, double *X) { - CLBLAS_CHECK( - clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, - NULL, NULL)); -} - -template <> -void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); -} - -template <> -void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); -} - -template <> -void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) 
{ - cblas_saxpby(N, alpha, X, 1, beta, Y, 1); -} - -template <> -void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - cblas_daxpby(N, alpha, X, 1, beta, Y, 1); -} - -template <> -void caffe_add(const int n, const float* a, const float* b, float* y) { - vsAdd(n, a, b, y); -} - -template <> -void caffe_add(const int n, const double* a, const double* b, - double* y) { - vdAdd(n, a, b, y); -} - -template <> -void caffe_sub(const int n, const float* a, const float* b, float* y) { - vsSub(n, a, b, y); -} - -template <> -void caffe_sub(const int n, const double* a, const double* b, - double* y) { - vdSub(n, a, b, y); -} - -template <> -void caffe_mul(const int n, const float* a, const float* b, float* y) { - vsMul(n, a, b, y); -} - -template <> -void caffe_mul(const int n, const double* a, const double* b, - double* y) { - vdMul(n, a, b, y); -} - -template <> -void caffe_div(const int n, const float* a, const float* b, float* y) { - vsDiv(n, a, b, y); -} - -template <> -void caffe_div(const int n, const double* a, const double* b, - double* y) { - vdDiv(n, a, b, y); -} - -template <> -void caffe_powx(const int n, const float* a, const float b, float* y) { - vsPowx(n, a, b, y); -} - -template <> -void caffe_powx(const int n, const double* a, const double b, - double* y) { - vdPowx(n, a, b, y); -} - -template <> -void caffe_sqr(const int n, const float* a, float* y) { - vsSqr(n, a, y); -} - -template <> -void caffe_sqr(const int n, const double* a, double* y) { - vdSqr(n, a, y); -} - -template <> -void caffe_exp(const int n, const float* a, float* y) { - vsExp(n, a, y); +void caffe_gpu_sgnbit(const int n, const float* x, float* y) { } template <> -void caffe_exp(const int n, const double* a, double* y) { - vdExp(n, a, y); +void caffe_gpu_sgnbit(const int n, const double* x, double* y) { } -unsigned int caffe_rng_rand() { - return (*caffe_rng())(); +template <> +void caffe_gpu_abs(const int n, const float* x, float* y) { 
+ caffe_gpu_abs_ocl(n, x, y); } -template -Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter < Dtype - > (b, std::numeric_limits < Dtype > ::max()); +template <> +void caffe_gpu_abs(const int n, const double* x, double* y) { + caffe_gpu_abs_ocl(n, x, y); } -template -float caffe_nextafter(const float b); -template -double caffe_nextafter(const double b); - -template -void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_LE(a, b); - boost::uniform_real < Dtype - > random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > variate_generator( - caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); - } - - //LOG(INFO) << "caffe_rng_uniform"; +//template +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, + NULL, NULL); +// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); +} +/* + template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); + template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); + template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); + template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); + */ +template <> +void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); } -template -void caffe_rng_uniform(const int n, const float a, const float b, - float* r); - -template -void caffe_rng_uniform(const int n, const double a, const double b, - double* r); +template <> +void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); +} -template -void 
caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma, - Dtype* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GT(sigma, 0); - boost::normal_distribution < Dtype > random_distribution(a, sigma); - boost::variate_generator > variate_generator( - caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); +template <> +void caffe_gpu_copy(const int N, const float* X, float* Y) { + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } } -template -void caffe_rng_gaussian(const int n, const float mu, const float sigma, - float* r); - -template -void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); - -template -void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution < Dtype > random_distribution(p); - boost::variate_generator > variate_generator( - caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); +template <> +void caffe_gpu_copy(const int N, const double* X, double* Y) { + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); } } -template -void caffe_rng_bernoulli(const int n, const double p, int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, int* r); - -template -void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { - CHECK_GE(n, 0); - CHECK(r); - CHECK_GE(p, 0); - CHECK_LE(p, 1); - boost::bernoulli_distribution < Dtype > random_distribution(p); - boost::variate_generator > variate_generator( - caffe_rng(), random_distribution); - for (int i = 0; i < n; ++i) { - r[i] = static_cast(variate_generator()); - } +template <> +void caffe_gpu_scal(const int N, const float alpha, float *X) { + CLBLAS_CHECK( + clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, 
&(amdDevice.CommandQueue), 0, + NULL, NULL)); } -template -void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); +template <> +void caffe_gpu_scal(const int N, const double alpha, double *X) { + CLBLAS_CHECK( + clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); +} -template -void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); -// template <> -float caffe_cpu_dot(const int n, const float* x, const float* y) { - return cblas_sdot(n, x, 1, y, 1); +void caffe_gpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> -double caffe_cpu_dot(const int n, const double* x, const double* y) { - return cblas_ddot(n, x, 1, y, 1); +void caffe_gpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); } template <> @@ -653,38 +761,6 @@ void caffe_gpu_dot(const int n, const double* x, const double* y, clReleaseMemObject(d_out); } -template <> -int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcount( - static_cast(x[i]) ^ static_cast(y[i])); - } - return dist; -} - -template <> -int caffe_cpu_hamming_distance(const int n, const double* x, - const double* y) { - int dist = 0; - for (int i = 0; i < n; ++i) { - dist += __builtin_popcountl( - static_cast(x[i]) ^ static_cast(y[i])); - } - return dist; -} - -template <> -float caffe_cpu_asum(const int n, const float* x) { - return cblas_sasum(n, x, 1); -} - -template <> -double caffe_cpu_asum(const int n, const double* x) { - return cblas_dasum(n, x, 1); -} - template <> void caffe_gpu_asum(const int n, const float* x, float* y) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, @@ -713,27 +789,7 @@ void caffe_gpu_asum(const int n, 
const double* x, double* y) { clReleaseMemObject(d_y); } -//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) -// - (x[index] < Dtype(0))); -//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); - -INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); -INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); -INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); - -template <> -void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { - cblas_scopy(n, x, 1, y, 1); - cblas_sscal(n, alpha, y, 1); -} -template <> -void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { - cblas_dcopy(n, x, 1, y, 1); - cblas_dscal(n, alpha, y, 1); -} template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, @@ -902,48 +958,6 @@ void caffe_gpu_log(const int N, const double* a, double* y) { kernel_log(N, a, y); } -template <> -void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); -} - -template <> -void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); -} - -template -void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); -#else - NO_GPU; -#endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } -} - -template void caffe_copy(const int N, const int* X, int* Y); -template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); -template void caffe_copy(const int N, const float* X, float* Y); -template void caffe_copy(const int N, const double* X, double* Y); - -template <> -void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); -} - -template <> -void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); -} - template <> void caffe_gpu_add(const int N, const float* a, const float* b, float* y) { @@ -957,32 
+971,5 @@ void caffe_gpu_add(const int N, const double* a, const double* b, // NOLINT_NEXT_LINE(whitespace/operators) kernel_add(N, a, b, y); } - -template <> -float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); -} - -template <> -double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); -} - -template -void caffe_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - return; - } - for (int i = 0; i < N; ++i) { - Y[i] = alpha; - } -} - -template void caffe_set(const int N, const int alpha, int* Y); -template void caffe_set(const int N, const float alpha, float* Y); -template void caffe_set(const int N, const double alpha, double* Y); - +#endif } // namespace caffe diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 6b5045d8..1123e2b3 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -32,6 +32,9 @@ #include "caffe/common.hpp" #include "caffe/util/ocl_util.hpp" namespace caffe { + +#ifndef CPU_ONLY + template extern std::string get_dtype_suffix(); template @@ -88,4 +91,5 @@ void eventCallback(cl_event event, cl_int event_status, void* user_data) { printf("The kernel's running time is %f s\n", run_time * 1.0e-9); } +#endif } // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index fcc2479e..53ebb751 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -33,6 +33,8 @@ #include "caffe/util/ocl_util.hpp" #include "caffe/util/ocl_wrapper.hpp" namespace caffe { + +#ifndef CPU_ONLY typedef unsigned int uint32_t; struct array4x32 { uint32_t v[4]; @@ -1929,5 +1931,8 @@ template void ocl_conv(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, 
int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz); + +#endif + } // namespace caffe From fd94a965ea9d478e84e9c63eb66a850b5302b8f0 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Sun, 13 Sep 2015 18:36:44 -0700 Subject: [PATCH 102/124] add find path for AMDAPPSDK3.0 and addes src/caffe/CMakeLists.txt --- cmake/Modules/FindOpenCL.cmake | 2 +- src/caffe/CMakeLists.txt | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/caffe/CMakeLists.txt diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake index 7c23701d..93abd4f9 100644 --- a/cmake/Modules/FindOpenCL.cmake +++ b/cmake/Modules/FindOpenCL.cmake @@ -75,7 +75,7 @@ if( LIB64 ) $ENV{AMDAPPSDKROOT}/lib $ENV{CUDA_PATH}/lib DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86_64 x64 + PATH_SUFFIXES x86_64 x64 x86_64/sdk PATHS /usr/lib /usr/local/cuda/lib diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt new file mode 100644 index 00000000..3e675c20 --- /dev/null +++ b/src/caffe/CMakeLists.txt @@ -0,0 +1,34 @@ +# generate protobuf sources +file(GLOB proto_files proto/*.proto) +caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files}) + +# include python files either to force generation +add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) +set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend! 
+caffe_default_properties(proto) + +# --[ Caffe library + +# creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists +caffe_pickup_caffe_sources(${PROJECT_SOURCE_DIR}) + +if(HAVE_CUDA) + caffe_cuda_compile(cuda_objs ${cuda}) + list(APPEND srcs ${cuda_objs} ${cuda}) +endif() + +add_library(caffe ${srcs}) +target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) +caffe_default_properties(caffe) + +# ---[ Tests + add_subdirectory(test) + +# ---[ Install +install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) +install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) +install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) + +file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) +list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) +install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) From 1a2f0222dc7f384800c94d72493f42e62e01b0f8 Mon Sep 17 00:00:00 2001 From: Yibing Date: Mon, 14 Sep 2015 11:23:08 +0800 Subject: [PATCH 103/124] Add the change in tools/ --- tools/caffe.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/caffe.cpp b/tools/caffe.cpp index d7953bdd..79b8e127 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -16,7 +16,9 @@ using caffe::Layer; using caffe::shared_ptr; using caffe::Timer; using caffe::vector; +#ifndef CPU_ONLY using caffe::amdDevice; +#endif DEFINE_int32(gpu, -1, "Run in GPU mode on given device ID."); @@ -247,9 +249,9 @@ int time() { std::vector backward_time_per_layer(layers.size(), 0.0); double forward_time = 0.0; double backward_time = 0.0; - +#ifndef CPU_ONLY clFinish(amdDevice.CommandQueue); - +#endif for (int j = 0; j < FLAGS_iterations; ++j) { Timer iter_timer; iter_timer.Start(); @@ -257,9 +259,9 @@ int time() { for (int i = 0; i < layers.size(); ++i) { timer.Start(); layers[i]->Forward(bottom_vecs[i], top_vecs[i]); - +#ifndef CPU_ONLY clFinish(amdDevice.CommandQueue); - +#endif forward_time_per_layer[i] += timer.MicroSeconds(); } forward_time += 
forward_timer.MicroSeconds(); @@ -268,9 +270,9 @@ int time() { timer.Start(); layers[i]->Backward(top_vecs[i], bottom_need_backward[i], bottom_vecs[i]); - +#ifndef CPU_ONLY clFinish(amdDevice.CommandQueue); - +#endif backward_time_per_layer[i] += timer.MicroSeconds(); } backward_time += backward_timer.MicroSeconds(); From a3d5b15e52514d8a21b7dfcd03b4d9c89cc4a00e Mon Sep 17 00:00:00 2001 From: Yibing Date: Mon, 14 Sep 2015 11:48:06 +0800 Subject: [PATCH 104/124] Clean test code --- src/caffe/test/test_caffe_main.cpp | 16 ---------------- src/caffe/test/test_inner_product_layer.cpp | 8 -------- 2 files changed, 24 deletions(-) diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index 278d520c..32643b3b 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -5,16 +5,6 @@ #include "caffe/common.hpp" #include "caffe/test/test_caffe_main.hpp" -namespace caffe { -#ifndef CPU_ONLY - //cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif -} - -#ifndef CPU_ONLY -//using caffe::CAFFE_TEST_CUDA_PROP; - -#endif int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); @@ -22,22 +12,16 @@ int main(int argc, char** argv) { #ifndef CPU_ONLY // Before starting testing, let's first print out a few cuda defice info. int device = 0; -// cudaGetDeviceCount(&device); - // cout << "Cuda number of devices: " << device << endl; if (argc > 1) { // Use the given device device = atoi(argv[1]); - // cudaSetDevice(device); caffe::amdDevice.Init(device); cout << "Setting to use device " << device << endl; } else if (OPENCL_TEST_DEVICE >= 0) { // Use the device assigned in build configuration; but with a lower priority device = OPENCL_TEST_DEVICE; } -// cudaGetDevice(&device); cout << "Current device id: " << device << endl; - // cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); -// caffe::set_mode(caffe::GPU); caffe::amdDevice.Init(); #endif // invoke the test. 
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index f0c36b13..7913b49c 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -57,10 +57,6 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { TYPED_TEST(InnerProductLayerTest, TestForward) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || sizeof(Dtype) == 4 ) { LayerParameter layer_param; @@ -87,10 +83,6 @@ TYPED_TEST(InnerProductLayerTest, TestForward) { TYPED_TEST(InnerProductLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || sizeof(Dtype) == 4 ) { LayerParameter layer_param; From aef701ce1d71601c539bbb2d67f8683f6f5d21c7 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 16 Sep 2015 00:23:50 +0800 Subject: [PATCH 105/124] Passed PReLU layer's unit test --- src/caffe/ocl/prelu_layer.cl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl index 5e8c521f..caff18b9 100644 --- a/src/caffe/ocl/prelu_layer.cl +++ b/src/caffe/ocl/prelu_layer.cl @@ -48,13 +48,13 @@ template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLU template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); template -__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) { +__kernel void PReLUParamBackward(const int count, __global T* 
in_diff, const int offset_in_diff, __global T* in_data, const int offset_in_data, __global T* out_diff) { int index = get_global_id(0); if(index < count) { - in_diff += offset_out; - out_diff += offset_in; + in_diff += offset_in_diff; + in_data += offset_in_data; out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); } } -template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff); -template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_in_diff, __global float* in_data, const int offset_in_data, __global float* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_in_diff, __global double* in_data, const int offset_in_data, __global double* out_diff); From c1102c3fd0e9df307c2a21a0f9d1864d6a896193 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 16 Sep 2015 01:10:30 +0800 Subject: [PATCH 106/124] Passed through Slice layer --- include/caffe/util/ocl_wrapper.hpp | 6 +++++ src/caffe/layers/slice_layer.cpp | 36 ++++++++++++++++++++++++++++-- src/caffe/ocl/slice_layer.cl | 28 +++++++++++++++++++++++ src/caffe/util/ocl_wrapper.cpp | 35 +++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 src/caffe/ocl/slice_layer.cl diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index a1d11d18..146567ea 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ 
b/include/caffe/util/ocl_wrapper.hpp @@ -340,6 +340,12 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a, template void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx, const int* mask, Dtype* bottom_diff); + +template +void Slice(const int nthreads, const Dtype* in_data, + const bool forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, Dtype* out_data); #endif } #endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 8263b92b..cd19fdb5 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -110,17 +110,49 @@ void SliceLayer::Backward_cpu(const vector*>& top, offset_slice_axis += top_slice_axis; } } + #ifndef CPU_ONLY template void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { + if (top.size() == 1) { return; } + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = true; + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_gpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, bottom_data, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); + offset_slice_axis += top_slice_axis; + } } template void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0] || top.size() == 1) { return; } + int offset_slice_axis = 0; + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int 
bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = false; + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, top_diff, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); + offset_slice_axis += top_slice_axis; + } } + #else STUB_GPU(SliceLayer); #endif diff --git a/src/caffe/ocl/slice_layer.cl b/src/caffe/ocl/slice_layer.cl new file mode 100644 index 00000000..26c6bb34 --- /dev/null +++ b/src/caffe/ocl/slice_layer.cl @@ -0,0 +1,28 @@ +template +__kernel void Slice(const int nthreads, __global const Dtype* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global Dtype* out_data) { + int index = get_global_id(0); + if (index < nthreads) { + const int total_slice_size = slice_size * top_slice_axis; + const int slice_num = index / total_slice_size; + const int slice_index = index % total_slice_size; + const int bottom_index = slice_index + + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; + if (forward) { + out_data[index] = in_data[bottom_index]; + } else { + out_data[bottom_index] = in_data[index]; + } + } +} + +template __attribute__ ((mangled_name(Slice_float))) __kernel void Slice(const int nthreads, __global const float* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global float* out_data); +template __attribute__ ((mangled_name(Slice_double))) __kernel void Slice(const int nthreads, __global const double* in_data, + const int forward, const int num_slices, const int slice_size, 
+ const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global double* out_data); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 53ebb751..8c35e719 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1917,6 +1917,41 @@ template void MaxBackward(const int nthreads, const float* top_diff, template void MaxBackward(const int nthreads, const double* top_diff, const int blob_idx, const int* mask, double* bottom_diff); +template +void Slice(const int nthreads, const Dtype* in_data, + const bool forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, Dtype* out_data) { + std::string kernel_name = "Slice" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == true) ? 1 : 0; + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_slices); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &slice_size); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &bottom_slice_axis); + ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &top_slice_axis); + ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_slice_axis); + ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) nthreads }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void Slice(const int nthreads, const float* in_data, + const bool forward, const int num_slices, const int slice_size, + const int 
bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, float* out_data); +template void Slice(const int nthreads, const double* in_data, + const bool forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, double* out_data); + template void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, From 8b433d14255da4625a25f3af42ae7b48a79a79d9 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 16 Sep 2015 14:11:48 +0800 Subject: [PATCH 107/124] Passed through Im2col_layer test --- src/caffe/layers/im2col_layer.cpp | 1 + src/caffe/util/im2col.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index f51fd7cc..38e1fd20 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -103,6 +103,7 @@ void Im2colLayer::Forward_gpu(const vector*>& bottom, template void Im2colLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); for (int n = 0; n < top[0]->num(); ++n) { diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index ab023e70..89985534 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -196,8 +196,8 @@ template void im2col_gpu(const double* data_im, const int img_offset, const int stride_w, double* data_col, const int col_offset); template -void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, - const int width, const int channels, const int patch_h, const int patch_w, +void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, const int 
pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset) { std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); @@ -233,11 +233,11 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, } template void col2im_gpu(const float* data_col, const int col_offset, - const int height, const int width, const int channels, const int patch_h, + const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_im, const int img_offset); template void col2im_gpu(const double* data_col, const int col_offset, - const int height, const int width, const int channels, const int patch_h, + const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im, const int img_offset); From 1ec6e88956d065c504e1eaeff1e5a8ba0c4dcb4d Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Tue, 15 Sep 2015 23:12:05 -0700 Subject: [PATCH 108/124] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index dd3933e6..56786f8f 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,10 @@ We will keep updating the latest performance as we make optimizations. Fury resu For more information on how to install, use or contribute to this code base, please visit our wiki page: https://github.com/amd/OpenCL-caffe/wiki +#Contributors +Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu +We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. + #Support needed As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. 
From c5eeb40515d1a9fa8fdaf7d8ab413f0ab7fa97b8 Mon Sep 17 00:00:00 2001 From: Yibing Date: Wed, 16 Sep 2015 22:23:39 +0800 Subject: [PATCH 109/124] Tested reduction_layer & deconv_layer --- include/caffe/util/math_functions.hpp | 19 ++++- include/caffe/util/ocl_util.hpp | 2 +- include/caffe/util/ocl_wrapper.hpp | 3 + src/caffe/layers/deconv_layer.cpp | 18 ++-- src/caffe/layers/reduction_layer.cpp | 25 +++--- src/caffe/ocl/util.cl | 21 ++++- src/caffe/util/math_functions.cpp | 117 ++++++++++++++++++++++++-- src/caffe/util/ocl_util.cpp | 11 ++- src/caffe/util/ocl_wrapper.cpp | 22 +++++ 9 files changed, 197 insertions(+), 41 deletions(-) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 7178ea74..4ca1fac0 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -107,7 +107,7 @@ template void caffe_set(const int N, const Dtype alpha, Dtype *X); template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X, const int offset=0); inline void caffe_memset(const size_t N, const int alpha, void* X) { memset(X, alpha, N); // NOLINT(caffe/alt_fn) @@ -127,6 +127,9 @@ void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); template void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); +template +void caffe_gpu_copy(const int N, const Dtype* X, const int offx, Dtype* Y, const int offy); + template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); @@ -141,7 +144,7 @@ template void caffe_scal(const int N, const Dtype alpha, Dtype *X); template -void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, const int offx = 0); template void caffe_sqr(const int N, const Dtype* a, Dtype* y); @@ -222,6 +225,9 @@ Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); template void caffe_gpu_dot(const int n, const Dtype* x, const 
Dtype* y, Dtype* out); +template +void caffe_gpu_dot(const int n, const Dtype* x, size_t offx, const Dtype* y, size_t offy, Dtype* out); + template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); @@ -236,6 +242,9 @@ Dtype caffe_cpu_asum(const int n, const Dtype* x); template void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); +template +void caffe_gpu_asum(const int n, const Dtype* x, size_t offx, Dtype* y); + // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c template @@ -282,6 +291,9 @@ DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); template void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); +template +void caffe_gpu_sign(const int N, const Dtype *X, const int offx, Dtype *Y, const int offy); + // This returns a nonzero value if the input has its sign bit set. // The name sngbit is meant to avoid conflicts with std::signbit in the macro using std::signbit; @@ -301,6 +313,9 @@ void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); +template +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, const int offx, Dtype* y, const int offy); + template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index 00bfa3cf..dcdf1057 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -30,7 +30,7 @@ namespace caffe { #ifndef CPU_ONLY template -void ocl_memset(Dtype* buffer, const Dtype value, const int count); +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count); diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp 
index 146567ea..61d6162e 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -229,6 +229,9 @@ void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y); template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y); +template +void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx, Dtype * Y, const int offy); + template void kernel_channel_max(const int num, const int channels, const int spatial_dim, const Dtype* data, Dtype* out); diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 2504f43a..5b0eeb03 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -79,10 +79,11 @@ void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); for (int n = 0; n < this->num_; ++n) { - this->bottom_offset_ = bottom[i]->offset(n); - this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = top[i]->offset(n); + this->top_offset_ = bottom[i]->offset(n); this->backward_gpu_gemm(bottom_data, weight, top_data); if (this->bias_term_) { + this->top_offset_ = top[i]->offset(n); const Dtype* bias = this->blobs_[1]->gpu_data(); this->forward_gpu_bias(top_data, bias); } @@ -104,23 +105,20 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); for (int n = 0; n < this->num_; ++n) { this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); + this->backward_gpu_bias(bias_diff, top_diff); } } if (this->param_propagate_down_[0] || propagate_down[i]) { for (int n = 0; n < this->num_; ++n) { - this->top_offset_ = top[i]->offset(n); - this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = bottom[i]->offset(n); + this->bottom_offset_ = top[i]->offset(n); // gradient w.r.t. weight. 
Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); + this->weight_gpu_gemm(top_diff, bottom_data, weight_diff); } // gradient w.r.t. bottom data, if necessary. if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); + this->forward_gpu_gemm(top_diff, weight, bottom_diff); } } } diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 9ec057b1..0358d83a 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -128,29 +128,32 @@ void ReductionLayer::Backward_cpu(const vector*>& top, template void ReductionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { + //Forward_cpu(bottom, top); +//return; const Dtype* bottom_data = bottom[0]->gpu_data(); const Dtype* mult_data = NULL; if (sum_multiplier_.count() > 0) { mult_data = sum_multiplier_.gpu_data(); } Dtype* top_data = top[0]->mutable_cpu_data(); + size_t bottom_offset = 0; for (int i = 0; i < num_; ++i) { switch (op_) { case ReductionParameter_ReductionOp_SUM: case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); + caffe_gpu_dot(dim_, mult_data, 0, bottom_data, bottom_offset, top_data); break; case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); + caffe_gpu_asum(dim_, bottom_data, bottom_offset, top_data); break; case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); + caffe_gpu_dot(dim_, bottom_data, bottom_offset, bottom_data, bottom_offset, top_data); break; default: LOG(FATAL) << "Unknown reduction op: " << ReductionParameter_ReductionOp_Name(op_); } - bottom_data += dim_; + bottom_offset += dim_; ++top_data; } if (coeff_ != Dtype(1)) { @@ -184,26 +187,28 @@ void ReductionLayer::Backward_gpu(const 
vector*>& top, } const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int bottom_data_offset = 0; + int bottom_diff_offset = 0; for (int i = 0; i < num_; ++i) { const Dtype bottom_coeff = (*top_diff) * coeff_; switch (op_) { case ReductionParameter_ReductionOp_SUM: case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); + caffe_gpu_set(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); break; case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); + caffe_gpu_sign(dim_, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); break; case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); break; default: LOG(FATAL) << "Unknown reduction op: " << ReductionParameter_ReductionOp_Name(op_); } - bottom_data += dim_; - bottom_diff += dim_; + bottom_data_offset += dim_; + bottom_diff_offset += dim_; ++top_diff; } } diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index 576a6e98..eced284b 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -27,16 +27,17 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable template -__kernel void OCL_memset(__global T* buffer, const T value, const int size) { +__kernel void OCL_memset(__global T* buffer, const T value, const int size, const int buf_offset) { int gdx = get_global_id(0); + buffer += buf_offset; if(gdx < size) { buffer[gdx] = value; } } -template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size); -template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int 
size); -template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size); +template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size, const int buf_offset); __kernel void OCL_memset2(__global int* buffer, const int value, const int size) { int gdx = get_global_id(0); @@ -56,6 +57,18 @@ __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y); +template +__kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx, __global T* Y, const int offy) { + X += offx; + Y += offy; + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } +} +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_float))) __kernel void caffe_gpu_sign_with_offset(const int N, __global float* X, const int offx, __global float* Y, const int offy); +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_double))) __kernel void caffe_gpu_sign_with_offset(const int N, __global double* X, const int offx, __global double* Y, const int offy); + template __kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) { int index = get_global_id(0); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index d1cfc954..aebeb5ed 100644 
--- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -702,16 +702,34 @@ void caffe_gpu_copy(const int N, const double* X, double* Y) { } template <> -void caffe_gpu_scal(const int N, const float alpha, float *X) { +void caffe_gpu_copy(const int N, const float* X, const int offx, float* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_copy(const int N, const double* X, const int offx, double* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_scal(const int N, const float alpha, float *X, const int offx) { CLBLAS_CHECK( - clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + clblasSscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } template <> -void caffe_gpu_scal(const int N, const double alpha, double *X) { +void caffe_gpu_scal(const int N, const double alpha, double *X, const int offx) { CLBLAS_CHECK( - clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0, + clblasDscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } @@ -761,6 +779,36 @@ void caffe_gpu_dot(const int n, const double* x, const double* y, clReleaseMemObject(d_out); } +template <> +void caffe_gpu_dot(const int n, const float* x, size_t offx, const float* y, size_t offy, float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, 
d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) { + //need to pass in scratchBuff + //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + template <> void caffe_gpu_asum(const int n, const float* x, float* y) { cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, @@ -789,6 +837,33 @@ void caffe_gpu_asum(const int n, const double* x, double* y) { clReleaseMemObject(d_y); } +template <> +void caffe_gpu_asum(const int n, const float* x, size_t offx, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, size_t offx, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = 
clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} template <> @@ -805,18 +880,32 @@ void caffe_gpu_scale(const int n, const double alpha, const double *x, caffe_gpu_scal(n, alpha, y); } +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + const int offx, float* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + const int offx, double* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + template void set_kernel(const int n, const Dtype alpha, Dtype* y) { } template <> -void caffe_gpu_set(const int N, const float alpha, float* Y) { - ocl_memset(Y, alpha, N); +void caffe_gpu_set(const int N, const float alpha, float* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); } template <> -void caffe_gpu_set(const int N, const double alpha, double* Y) { - ocl_memset(Y, alpha, N); +void caffe_gpu_set(const int N, const double alpha, double* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); } template <> @@ -844,11 +933,23 @@ void caffe_gpu_sign(const int N, const float *X, float *Y) { caffe_gpu_sign_ocl(N, X, Y); } + template <> void caffe_gpu_sign(const int N, const double *X, double *Y) { caffe_gpu_sign_ocl(N, X, Y); } +template <> +void caffe_gpu_sign(const int N, const float *X, const int offx, float *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy); +} + + +template <> +void caffe_gpu_sign(const int N, const double *X, const int offx, double *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, 
offy); +} + template <> void caffe_gpu_sub(const int N, const float* a, const float* b, float* y) { diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 1123e2b3..0b151e5a 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -38,13 +38,14 @@ namespace caffe { template extern std::string get_dtype_suffix(); template -void ocl_memset(Dtype* buffer, const Dtype value, const int count) { +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset) { std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int err = 0; err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + err |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &buf_offset); OCL_CHECK(err); size_t Global_Work_Size[1] = { (size_t) count }; @@ -55,11 +56,9 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count) { } -template void ocl_memset(int* buffer, const int value, const int count); -template void ocl_memset(float* buffer, const float value, - const int count); -template void ocl_memset(double* buffer, const double value, - const int count); +template void ocl_memset(int* buffer, const int value, const int count, const int buf_offset); +template void ocl_memset(float* buffer, const float value, const int count, const int buf_offset); +template void ocl_memset(double* buffer, const double value, const int count, const int buf_offset); void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count) { diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 8c35e719..7ffadc72 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1584,6 +1584,28 @@ template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); template 
void caffe_gpu_sign_ocl(const int N, const double* X, double* Y); +template +void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx, Dtype * Y, const int offy) { + std::string kernel_name = "caffe_gpu_sign_with_offset" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offx); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &Y); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offy); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_with_offset_ocl(const int N, const float* X, const int offx, float* Y, const int offy); +template void caffe_gpu_sign_with_offset_ocl(const int N, const double* X, const int offx, double* Y, const int offy); + + template void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); From 6a46781617065b3be1c5ed7aa404c0898d15d436 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 16 Sep 2015 10:25:27 -0700 Subject: [PATCH 110/124] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 56786f8f..a1bf49d6 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ For more information on how to install, use or contribute to this code base, ple #Contributors Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu + We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. 
#Support needed From b0cb051c6944b9e4e5fe8c61c85c646aa1f82d3b Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 16 Sep 2015 11:17:14 -0700 Subject: [PATCH 111/124] update gitignore --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 434c7112..5a2ad423 100644 --- a/.gitignore +++ b/.gitignore @@ -92,5 +92,6 @@ LOG* CURRENT MANIFEST-* -#log files -log +#cmakefiles +src/caffe/test/CMakeFiles +src/caffe/CMakeFiles From e7db7b1cd7939f12af265f45b8b88aaa03672319 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 16 Sep 2015 11:20:53 -0700 Subject: [PATCH 112/124] untrack auto generated src/caffe/test/CMakeFiles --- .../CMakeDirectoryInformation.cmake | 16 - ...le_generated_test_im2col_kernel.cu.o.cmake | 296 --- ...e_generated_test_im2col_kernel.cu.o.depend | 1 - src/caffe/test/CMakeFiles/progress.marks | 1 - .../CMakeFiles/runtest.dir/DependInfo.cmake | 27 - .../test/CMakeFiles/runtest.dir/build.make | 69 - .../CMakeFiles/runtest.dir/cmake_clean.cmake | 8 - .../test/CMakeFiles/runtest.dir/progress.make | 1 - .../test.testbin.dir/DependInfo.cmake | 92 - .../CMakeFiles/test.testbin.dir/build.make | 1623 ----------------- .../test.testbin.dir/cmake_clean.cmake | 68 - .../CMakeFiles/test.testbin.dir/depend.make | 2 - .../CMakeFiles/test.testbin.dir/flags.make | 8 - .../test/CMakeFiles/test.testbin.dir/link.txt | 1 - .../CMakeFiles/test.testbin.dir/progress.make | 60 - 15 files changed, 2273 deletions(-) delete mode 100644 src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake delete mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake delete mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend delete mode 100644 src/caffe/test/CMakeFiles/progress.marks delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/build.make delete 
mode 100644 src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/progress.make delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/build.make delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/depend.make delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/flags.make delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/link.txt delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/progress.make diff --git a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake deleted file mode 100644 index 7bb0014c..00000000 --- a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# Relative path conversion top directories. -SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe") -SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe") - -# Force unix paths in dependencies. -SET(CMAKE_FORCE_UNIX_PATHS 1) - - -# The C and CXX include file regular expressions for this directory. 
-SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$") -SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$") -SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN}) -SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN}) diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake deleted file mode 100644 index 895d9fca..00000000 --- a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake +++ /dev/null @@ -1,296 +0,0 @@ -# James Bigler, NVIDIA Corp (nvidia.com - jbigler) -# -# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. -# -# This code is licensed under the MIT License. See the FindCUDA.cmake script -# for the text of the license. - -# The MIT License -# -# License for the specific language governing rights and limitations under -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -########################################################################## -# This file runs the nvcc commands to produce the desired output file along with -# the dependency file needed by CMake to compute dependencies. In addition the -# file checks the output of each command and if the command fails it deletes the -# output files. - -# Input variables -# -# verbose:BOOL=<> OFF: Be as quiet as possible (default) -# ON : Describe each step -# -# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or -# RelWithDebInfo, but it should match one of the -# entries in CUDA_HOST_FLAGS. This is the build -# configuration used when compiling the code. If -# blank or unspecified Debug is assumed as this is -# what CMake does. -# -# generated_file:STRING=<> File to generate. This argument must be passed in. -# -# generated_cubin_file:STRING=<> File to generate. This argument must be passed -# in if build_cubin is true. 
- -if(NOT generated_file) - message(FATAL_ERROR "You must specify generated_file on the command line") -endif() - -# Set these up as variables to make reading the generated file easier -set(CMAKE_COMMAND "/usr/bin/cmake") # path -set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_kernel.cu") # path -set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.NVCC-depend") # path -set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.depend") # path -set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path -set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path -set(build_cubin OFF) # bool -set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool -# We won't actually use these variables for now, but we need to set this, in -# order to force this file to be run again if it changes. 
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.") # path -set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o") # path -set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt") # path - -set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path -set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC;-Xcompiler;-fPIC ;; ) # list -# Build specific configuration flags -set(CUDA_NVCC_FLAGS_DEBUG ; ) -set(CUDA_NVCC_FLAGS_RELEASE ; ) -set(CUDA_NVCC_FLAGS_MINSIZEREL ; ) -set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ; ) -set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list -set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly). -set(format_flag "-c") # string - -if(build_cubin AND NOT generated_cubin_file) - message(FATAL_ERROR "You must specify generated_cubin_file on the command line") -endif() - -# This is the list of host compilation flags. It C or CXX should already have -# been chosen by FindCUDA.cmake. 
-set(CMAKE_HOST_FLAGS -fPIC -Wall -Wno-sign-compare -Wno-uninitialized ) -set(CMAKE_HOST_FLAGS_DEBUG -g) -set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG) -set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG) -set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG) - -# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler -set(nvcc_host_compiler_flags "") -# If we weren't given a build_configuration, use Debug. -if(NOT build_configuration) - set(build_configuration Debug) -endif() -string(TOUPPER "${build_configuration}" build_configuration) -#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") -foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) - # Extra quotes are added around each flag to help nvcc parse out flags with spaces. - set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") -endforeach() -if (nvcc_host_compiler_flags) - set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) -endif() -#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") -# Add the build specific configuration flags -list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) - -# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority -list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) -list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) -if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) - if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) - set(CCBIN -ccbin "${CCBIN}") - else() - set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") - endif() -endif() - -# cuda_execute_process - Executes a command with optional command echo and status message. 
-# -# status - Status message to print if verbose is true -# command - COMMAND argument from the usual execute_process argument structure -# ARGN - Remaining arguments are the command with arguments -# -# CUDA_result - return value from running the command -# -# Make this a macro instead of a function, so that things like RESULT_VARIABLE -# and other return variables are present after executing the process. -macro(cuda_execute_process status command) - set(_command ${command}) - if(NOT _command STREQUAL "COMMAND") - message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") - endif() - if(verbose) - execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) - # Now we need to build up our command string. We are accounting for quotes - # and spaces, anything else is left up to the user to fix if they want to - # copy and paste a runnable command line. - set(cuda_execute_process_string) - foreach(arg ${ARGN}) - # If there are quotes, excape them, so they come through. - string(REPLACE "\"" "\\\"" arg ${arg}) - # Args with spaces need quotes around them to get them to be parsed as a single argument. - if(arg MATCHES " ") - list(APPEND cuda_execute_process_string "\"${arg}\"") - else() - list(APPEND cuda_execute_process_string ${arg}) - endif() - endforeach() - # Echo the command - execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) - endif() - # Run the command - execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) -endmacro() - -# Delete the target file -cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - -# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag -# for dependency generation and hope for the best. 
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -set(CUDA_VERSION 6.5) -if(CUDA_VERSION VERSION_LESS "3.0") - cmake_policy(PUSH) - # CMake policy 0007 NEW states that empty list elements are not - # ignored. I'm just setting it to avoid the warning that's printed. - cmake_policy(SET CMP0007 NEW) - # Note that this will remove all occurances of -G. - list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") - cmake_policy(POP) -endif() - -# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This -# can cause incorrect dependencies when #including files based on this macro which is -# defined in the generating passes of nvcc invokation. We will go ahead and manually -# define this for now until a future version fixes this bug. -set(CUDACC_DEFINE -D__CUDACC__) - -# Generate the dependency file -cuda_execute_process( - "Generating dependency file: ${NVCC_generated_dependency_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - -M - ${CUDACC_DEFINE} - "${source_file}" - -o "${NVCC_generated_dependency_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${depends_CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the cmake readable dependency file to a temp file. Don't put the -# quotes just around the filenames for the input_file and output_file variables. -# CMake will pass the quotes through and not be able to find the file. 
-cuda_execute_process( - "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:FILEPATH=${NVCC_generated_dependency_file}" - -D "output_file:FILEPATH=${cmake_dependency_file}.tmp" - -P "${CUDA_make2cmake}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Copy the file if it is different -cuda_execute_process( - "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Delete the temporary file -cuda_execute_process( - "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" - ) - -if(CUDA_result) - message(FATAL_ERROR "Error generating ${generated_file}") -endif() - -# Generate the code -cuda_execute_process( - "Generating ${generated_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${format_flag} -o "${generated_file}" - ${CCBIN} - ${nvcc_flags} - ${nvcc_host_compiler_flags} - ${CUDA_NVCC_FLAGS} - -DNVCC - ${CUDA_NVCC_INCLUDE_ARGS} - ) - -if(CUDA_result) - # Since nvcc can sometimes leave half done files make sure that we delete the output file. - cuda_execute_process( - "Removing ${generated_file}" - COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" - ) - message(FATAL_ERROR "Error generating file ${generated_file}") -else() - if(verbose) - message("Generated ${generated_file} successfully.") - endif() -endif() - -# Cubin resource report commands. -if( build_cubin ) - # Run with -cubin to produce resource usage report. 
- cuda_execute_process( - "Generating ${generated_cubin_file}" - COMMAND "${CUDA_NVCC_EXECUTABLE}" - "${source_file}" - ${CUDA_NVCC_FLAGS} - ${nvcc_flags} - ${CCBIN} - ${nvcc_host_compiler_flags} - -DNVCC - -cubin - -o "${generated_cubin_file}" - ${CUDA_NVCC_INCLUDE_ARGS} - ) - - # Execute the parser script. - cuda_execute_process( - "Executing the parser script" - COMMAND "${CMAKE_COMMAND}" - -D "input_file:STRING=${generated_cubin_file}" - -P "${CUDA_parse_cubin}" - ) - -endif() diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend deleted file mode 100644 index 8e3a0be1..00000000 --- a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend +++ /dev/null @@ -1 +0,0 @@ -#FindCUDA.cmake generated file. Do not edit. diff --git a/src/caffe/test/CMakeFiles/progress.marks b/src/caffe/test/CMakeFiles/progress.marks deleted file mode 100644 index 573541ac..00000000 --- a/src/caffe/test/CMakeFiles/progress.marks +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake deleted file mode 100644 index f660fadf..00000000 --- a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# The set of languages for which implicit dependencies are needed: -SET(CMAKE_DEPENDS_LANGUAGES - ) -# The set of files for implicit dependencies of each language: - -# Preprocessor definitions for this target. -SET(CMAKE_TARGET_DEFINITIONS - "GTEST_USE_OWN_TR1_TUPLE" - ) - -# Targets to which this target links. -SET(CMAKE_TARGET_LINKED_INFO_FILES - ) - -# The include file search paths: -SET(CMAKE_C_TARGET_INCLUDE_PATH - "src" - "/usr/local/include" - "include" - "/usr/local/cuda/include" - "/usr/local/include/opencv" - "/usr/include/atlas" - "." 
- ) -SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/test/CMakeFiles/runtest.dir/build.make b/src/caffe/test/CMakeFiles/runtest.dir/build.make deleted file mode 100644 index 7ccc5279..00000000 --- a/src/caffe/test/CMakeFiles/runtest.dir/build.make +++ /dev/null @@ -1,69 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Remove some rules from gmake that .SUFFIXES does not remove. -SUFFIXES = - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E remove -f - -# Escaping for special characters. -EQUALS = = - -# The program to use to edit the cache. -CMAKE_EDIT_COMMAND = /usr/bin/ccmake - -# The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# Utility rule file for runtest. - -# Include the progress variables for this target. 
-include src/caffe/test/CMakeFiles/runtest.dir/progress.make - -src/caffe/test/CMakeFiles/runtest: - /home/yugao/caffe-merge-junli/caffe-yb/caffe/test/test.testbin --gtest_shuffle - -runtest: src/caffe/test/CMakeFiles/runtest -runtest: src/caffe/test/CMakeFiles/runtest.dir/build.make -.PHONY : runtest - -# Rule to build all files generated by this target. -src/caffe/test/CMakeFiles/runtest.dir/build: runtest -.PHONY : src/caffe/test/CMakeFiles/runtest.dir/build - -src/caffe/test/CMakeFiles/runtest.dir/clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/runtest.dir/cmake_clean.cmake -.PHONY : src/caffe/test/CMakeFiles/runtest.dir/clean - -src/caffe/test/CMakeFiles/runtest.dir/depend: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake --color=$(COLOR) -.PHONY : src/caffe/test/CMakeFiles/runtest.dir/depend - diff --git a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake deleted file mode 100644 index ed560e60..00000000 --- a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FILE(REMOVE_RECURSE - "CMakeFiles/runtest" -) - -# Per-language clean rules from dependency scanning. 
-FOREACH(lang) - INCLUDE(CMakeFiles/runtest.dir/cmake_clean_${lang}.cmake OPTIONAL) -ENDFOREACH(lang) diff --git a/src/caffe/test/CMakeFiles/runtest.dir/progress.make b/src/caffe/test/CMakeFiles/runtest.dir/progress.make deleted file mode 100644 index 8b137891..00000000 --- a/src/caffe/test/CMakeFiles/runtest.dir/progress.make +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake deleted file mode 100644 index d4748b21..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake +++ /dev/null @@ -1,92 +0,0 @@ -# The set of languages for which implicit dependencies are needed: -SET(CMAKE_DEPENDS_LANGUAGES - "CXX" - ) -# The set of files for implicit dependencies of each language: -SET(CMAKE_DEPENDS_CHECK_CXX - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp" 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" - 
"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" - ) -SET(CMAKE_CXX_COMPILER_ID "GNU") - -# Preprocessor definitions for this target. -SET(CMAKE_TARGET_DEFINITIONS - "GTEST_USE_OWN_TR1_TUPLE" - ) - -# Targets to which this target links. -SET(CMAKE_TARGET_LINKED_INFO_FILES - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake" - "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake" - ) - -# The include file search paths: -SET(CMAKE_C_TARGET_INCLUDE_PATH - "src" - "/usr/local/include" - "include" - "/usr/local/cuda/include" - "/usr/local/include/opencv" - "/usr/include/atlas" - "." 
- ) -SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) -SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH}) diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make deleted file mode 100644 index c67def36..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make +++ /dev/null @@ -1,1623 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Remove some rules from gmake that .SUFFIXES does not remove. -SUFFIXES = - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E remove -f - -# Escaping for special characters. -EQUALS = = - -# The program to use to edit the cache. -CMAKE_EDIT_COMMAND = /usr/bin/ccmake - -# The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# Include any dependencies generated for this target. -include src/caffe/test/CMakeFiles/test.testbin.dir/depend.make - -# Include the progress variables for this target. 
-include src/caffe/test/CMakeFiles/test.testbin.dir/progress.make - -# Include the compile flags for this target's objects. -include src/caffe/test/CMakeFiles/test.testbin.dir/flags.make - -src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend -src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake -src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/test_im2col_kernel.cu - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//. 
- cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.cmake - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/test_spp_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp > CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color 
--switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/test_filler.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filler.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filler.cpp.i" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp > CMakeFiles/test.testbin.dir/test_filler.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filler.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp -o CMakeFiles/test.testbin.dir/test_filler.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/test_im2col_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp > CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: 
src/caffe/test/test_common.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_common.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp > CMakeFiles/test.testbin.dir/test_common.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_common.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp -o CMakeFiles/test.testbin.dir/test_common.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/test_infogain_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/test_math_functions.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_math_functions.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp > 
CMakeFiles/test.testbin.dir/test_math_functions.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_math_functions.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/test_euclidean_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: 
src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/test_split_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_split_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp > CMakeFiles/test.testbin.dir/test_split_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_split_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires - $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/test_reshape_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp > CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/test_random_number_generator.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && 
/usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp > CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/test_lrn_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object 
src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp > CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/test_gradient_based_solver.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp > CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires: -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/test_upgrade_proto.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp > CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s: 
cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/test_io.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to 
CMakeFiles/test.testbin.dir/test_io.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp > CMakeFiles/test.testbin.dir/test_io.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_io.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp -o CMakeFiles/test.testbin.dir/test_io.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/test_accuracy_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp > CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/test_caffe_main.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp > CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/test_net.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_net.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp > CMakeFiles/test.testbin.dir/test_net.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_net.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp -o CMakeFiles/test.testbin.dir/test_net.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires: -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/test_filter_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp > CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/test_power_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_power_layer.cpp.i" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp > CMakeFiles/test.testbin.dir/test_power_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_power_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/test_softmax_with_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/test_argmax_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp > CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires: -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/test_solver.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_solver.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp > CMakeFiles/test.testbin.dir/test_solver.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly 
CMakeFiles/test.testbin.dir/test_solver.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp -o CMakeFiles/test.testbin.dir/test_solver.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/test_blob.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_blob.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp > CMakeFiles/test.testbin.dir/test_blob.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_blob.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp -o CMakeFiles/test.testbin.dir/test_blob.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/test_benchmark.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_benchmark.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp > CMakeFiles/test.testbin.dir/test_benchmark.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_benchmark.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/test_multinomial_logistic_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/test_util_blas.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_util_blas.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp > CMakeFiles/test.testbin.dir/test_util_blas.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_util_blas.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: 
src/caffe/test/test_internal_thread.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp > CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/test_reduction_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp > CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/test_contrastive_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i" - cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/test_eltwise_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green 
"Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp > CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/test_maxpool_dropout_layers.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp > CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/test_threshold_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp > 
CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/test_pooling_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp > CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: 
src/caffe/test/test_softmax_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/test_inner_product_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp > CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/test_flatten_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp > CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/test_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp > CMakeFiles/test.testbin.dir/test_data_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/test_syncedmem.cpp - 
$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp > CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/test_hdf5data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires: -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/test_deconvolution_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp > CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/test_neuron_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp > CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/test_concat_layer.cpp - 
$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp > CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/test_protobuf.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_protobuf.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp > CMakeFiles/test.testbin.dir/test_protobuf.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_protobuf.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires: -.PHONY : 
src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/test_hdf5_output_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E 
cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/test_memory_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp > CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/test_tanh_layer.cpp - $(CMAKE_COMMAND) -E 
cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp > CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/test_stochastic_pooling.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp > CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/test_dummy_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp > 
CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/test_layer_factory.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o -c 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp > CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/test_db.cpp - $(CMAKE_COMMAND) -E 
cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_db.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp > CMakeFiles/test.testbin.dir/test_db.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_db.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp -o CMakeFiles/test.testbin.dir/test_db.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o - 
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/test_mvn_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp > CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires - $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/test_convolution_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp > CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/test_slice_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) 
$(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp > CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/test_hinge_loss_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o 
CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: 
src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/test_image_data_layer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp > CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/test_platform.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_platform.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_platform.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp > CMakeFiles/test.testbin.dir/test_platform.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_platform.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ 
$(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp -o CMakeFiles/test.testbin.dir/test_platform.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/test_data_transformer.cpp - $(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59) - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -E 
/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp > CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires: -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires - $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides - -src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o - -# Object files for target test.testbin -test_testbin_OBJECTS = \ -"CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_filler.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_common.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" \ 
-"CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_io.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_net.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_solver.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_blob.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" \ 
-"CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_db.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_platform.cpp.o" \ -"CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" - -# External object files for target test.testbin -test_testbin_EXTERNAL_OBJECTS = \ -"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o" - -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o -test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o -test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o -test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o -test/test.testbin: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o -test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/build.make -test/test.testbin: lib/libgtest.a -test/test.testbin: lib/libcaffe.so -test/test.testbin: lib/libproto.a -test/test.testbin: /usr/local/lib/libboost_system.so -test/test.testbin: /usr/local/lib/libboost_thread.so -test/test.testbin: /usr/lib/x86_64-linux-gnu/libpthread.so -test/test.testbin: /usr/local/lib/libglog.so -test/test.testbin: /usr/local/lib/libgflags.a -test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so -test/test.testbin: /usr/local/lib/libglog.so -test/test.testbin: /usr/local/lib/libgflags.a -test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so -test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so -test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5.so -test/test.testbin: /usr/local/lib/liblmdb.so -test/test.testbin: /usr/lib/x86_64-linux-gnu/libleveldb.so -test/test.testbin: /usr/lib/libsnappy.so -test/test.testbin: /usr/local/cuda/lib64/libcudart.so -test/test.testbin: /usr/local/cuda/lib64/libcurand.so -test/test.testbin: /usr/local/cuda/lib64/libcublas.so -test/test.testbin: /usr/local/lib/libopencv_highgui.so.2.4.10 -test/test.testbin: /usr/local/lib/libopencv_imgproc.so.2.4.10 -test/test.testbin: /usr/local/lib/libopencv_core.so.2.4.10 -test/test.testbin: /usr/lib/liblapack_atlas.so -test/test.testbin: /usr/lib/libcblas.so -test/test.testbin: /usr/lib/libatlas.so -test/test.testbin: 
src/caffe/test/CMakeFiles/test.testbin.dir/link.txt - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX executable ../../../test/test.testbin" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/test.testbin.dir/link.txt --verbose=$(VERBOSE) - -# Rule to build all files generated by this target. -src/caffe/test/CMakeFiles/test.testbin.dir/build: test/test.testbin -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/build - -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires 
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires 
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires 
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires 
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires -src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/requires - -src/caffe/test/CMakeFiles/test.testbin.dir/clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/test.testbin.dir/cmake_clean.cmake -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/clean - -src/caffe/test/CMakeFiles/test.testbin.dir/depend: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake --color=$(COLOR) -.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/depend - diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake deleted file mode 100644 index 3270b673..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake +++ /dev/null @@ -1,68 +0,0 @@ -FILE(REMOVE_RECURSE - "CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o" - "CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_filler.cpp.o" - "CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_common.cpp.o" - "CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" - "CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" - 
"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" - "CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" - "CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" - "CMakeFiles/test.testbin.dir/test_io.cpp.o" - "CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" - "CMakeFiles/test.testbin.dir/test_net.cpp.o" - "CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_solver.cpp.o" - "CMakeFiles/test.testbin.dir/test_blob.cpp.o" - "CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" - "CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" - "CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" - "CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" - "CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" - "CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" - 
"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" - "CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" - "CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" - "CMakeFiles/test.testbin.dir/test_db.cpp.o" - "CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" - "CMakeFiles/test.testbin.dir/test_platform.cpp.o" - "CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o" - "../../../test/test.testbin.pdb" - "../../../test/test.testbin" -) - -# Per-language clean rules from dependency scanning. -FOREACH(lang CXX) - INCLUDE(CMakeFiles/test.testbin.dir/cmake_clean_${lang}.cmake OPTIONAL) -ENDFOREACH(lang) diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make deleted file mode 100644 index e3607644..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make +++ /dev/null @@ -1,2 +0,0 @@ -# Empty dependencies file for test.testbin. -# This may be replaced when dependencies are built. diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make deleted file mode 100644 index 8b4ef992..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make +++ /dev/null @@ -1,8 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! 
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# compile CXX with /usr/bin/c++ -CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe - -CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE - diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt deleted file mode 100644 index 35426fa4..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt +++ /dev/null @@ -1 +0,0 @@ -/usr/bin/c++ -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o CMakeFiles/test.testbin.dir/test_filler.cpp.o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o CMakeFiles/test.testbin.dir/test_common.cpp.o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o CMakeFiles/test.testbin.dir/test_io.cpp.o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o CMakeFiles/test.testbin.dir/test_net.cpp.o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_solver.cpp.o 
CMakeFiles/test.testbin.dir/test_blob.cpp.o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o CMakeFiles/test.testbin.dir/test_db.cpp.o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_platform.cpp.o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o 
CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o -o ../../../test/test.testbin -L/usr/local/cuda/lib64 -L/usr/local/lib -rdynamic ../../../lib/libgtest.a -Wl,--whole-archive ../../../lib/libcaffe.so -Wl,--no-whole-archive ../../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 /usr/local/lib/libopencv_core.so.2.4.10 -llapack_atlas -lcblas -latlas -Wl,-rpath,/usr/local/cuda/lib64:/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib:/usr/local/lib diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make deleted file mode 100644 index 9de70a55..00000000 --- a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make +++ /dev/null @@ -1,60 +0,0 @@ -CMAKE_PROGRESS_1 = -CMAKE_PROGRESS_2 = 69 -CMAKE_PROGRESS_3 = -CMAKE_PROGRESS_4 = 70 -CMAKE_PROGRESS_5 = -CMAKE_PROGRESS_6 = 71 -CMAKE_PROGRESS_7 = -CMAKE_PROGRESS_8 = 72 -CMAKE_PROGRESS_9 = -CMAKE_PROGRESS_10 = 73 -CMAKE_PROGRESS_11 = -CMAKE_PROGRESS_12 = 74 -CMAKE_PROGRESS_13 = -CMAKE_PROGRESS_14 = 75 -CMAKE_PROGRESS_15 = -CMAKE_PROGRESS_16 = 76 -CMAKE_PROGRESS_17 = -CMAKE_PROGRESS_18 = 77 -CMAKE_PROGRESS_19 = -CMAKE_PROGRESS_20 = 78 -CMAKE_PROGRESS_21 = -CMAKE_PROGRESS_22 = 79 -CMAKE_PROGRESS_23 = -CMAKE_PROGRESS_24 = 80 -CMAKE_PROGRESS_25 = -CMAKE_PROGRESS_26 = 81 -CMAKE_PROGRESS_27 = -CMAKE_PROGRESS_28 = 82 -CMAKE_PROGRESS_29 = -CMAKE_PROGRESS_30 = 83 -CMAKE_PROGRESS_31 = -CMAKE_PROGRESS_32 = 84 -CMAKE_PROGRESS_33 = -CMAKE_PROGRESS_34 = 85 -CMAKE_PROGRESS_35 = -CMAKE_PROGRESS_36 = 86 -CMAKE_PROGRESS_37 = 
-CMAKE_PROGRESS_38 = 87 -CMAKE_PROGRESS_39 = -CMAKE_PROGRESS_40 = 88 -CMAKE_PROGRESS_41 = -CMAKE_PROGRESS_42 = 89 -CMAKE_PROGRESS_43 = -CMAKE_PROGRESS_44 = 90 -CMAKE_PROGRESS_45 = -CMAKE_PROGRESS_46 = 91 -CMAKE_PROGRESS_47 = -CMAKE_PROGRESS_48 = 92 -CMAKE_PROGRESS_49 = -CMAKE_PROGRESS_50 = 93 -CMAKE_PROGRESS_51 = -CMAKE_PROGRESS_52 = 94 -CMAKE_PROGRESS_53 = -CMAKE_PROGRESS_54 = 95 -CMAKE_PROGRESS_55 = -CMAKE_PROGRESS_56 = 96 -CMAKE_PROGRESS_57 = -CMAKE_PROGRESS_58 = 97 -CMAKE_PROGRESS_59 = - From df57731ace57e2f46ec5b16942694a3f23489f82 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Wed, 16 Sep 2015 16:44:49 -0700 Subject: [PATCH 113/124] removed unnecessary cmake files --- src/caffe/.OCL_kernel.cl.swo | Bin 98304 -> 0 bytes src/caffe/Makefile | 2279 ----------------- .../CMakeDirectoryInformation.cmake | 16 - .../CMakeFiles/gtest.dir/DependInfo.cmake | 32 - src/gtest/CMakeFiles/gtest.dir/build.make | 106 - .../CMakeFiles/gtest.dir/cmake_clean.cmake | 10 - .../gtest.dir/cmake_clean_target.cmake | 3 - src/gtest/CMakeFiles/gtest.dir/depend.make | 2 - src/gtest/CMakeFiles/gtest.dir/flags.make | 8 - src/gtest/CMakeFiles/gtest.dir/link.txt | 2 - src/gtest/CMakeFiles/gtest.dir/progress.make | 2 - src/gtest/CMakeFiles/progress.marks | 1 - src/gtest/Makefile | 212 -- src/gtest/cmake_install.cmake | 34 - 14 files changed, 2707 deletions(-) delete mode 100644 src/caffe/.OCL_kernel.cl.swo delete mode 100644 src/caffe/Makefile delete mode 100644 src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake delete mode 100644 src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake delete mode 100644 src/gtest/CMakeFiles/gtest.dir/build.make delete mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake delete mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake delete mode 100644 src/gtest/CMakeFiles/gtest.dir/depend.make delete mode 100644 src/gtest/CMakeFiles/gtest.dir/flags.make delete mode 100644 src/gtest/CMakeFiles/gtest.dir/link.txt delete mode 100644 
src/gtest/CMakeFiles/gtest.dir/progress.make delete mode 100644 src/gtest/CMakeFiles/progress.marks delete mode 100644 src/gtest/Makefile delete mode 100644 src/gtest/cmake_install.cmake diff --git a/src/caffe/.OCL_kernel.cl.swo b/src/caffe/.OCL_kernel.cl.swo deleted file mode 100644 index 62349bbdafdf55d217551523bd623664e7967190..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 98304 zcmeI534B~tz4!yDSQHci;UU5W1(P-<&C&(h0Hw5umKLF`v5wPZk_=5UAv5Vh1)mBo z$mWLno(pdHM8O4pATCc)d?<>jsJI}DiVKMPN7N@E|L^bY_s+eOnWSwHl25;#nS0MY z_nhC^e!t&2_uO^!PC4%QXyKteIXI;JH<*#aL+B`mxFUj!<{+1qp z%-j=~pR|1GiOXJGE9ki@!nWRge}7@_aA9nqF!#*yk)fhFvU#2U(pDbpHQ!b*TbWy5 z7#k@J9ojooE9#kZGZL7Qz=RUm_2kZu_IVQQ1qU4%?eUzGC%j@Nz>EZDBrqd^841it zU`7Hn5}1*|j09#R@WhcoW%RjCuO>TtnJisoK6glb?r45rXg)7aoPSE<{Jtgv8_!b{ z=dVbdf0{Xesrl?MH?-I9Y>vOoeBO{a|8#SFiTT`PC;i1O+^ zCeA-2asI}{`DdE*3(e<06X$m|$B#Cj^Nql4`gSwN+s)_3#QA5L<42j#|4N)Uw`=M# z=ac1wYe)Uk0~FpUL|7Yjb{)xjtF{ew8?%tbe~W=dFB__3s{YeyNE+S^w@f=aqAS zFvO`Jb8d z)&cqaME&@wIdAJzf8xA}cIIP70y7erk-&@uW+X5pff)(RNMJ?+PiP6``tPID0t?_E_y>7Kd>%ds7eha&y5m~OtXZcWx}m3YZS;!nNWRTmyST|6>yAz=RaWLYJDPQ%*62ij zZ0gFj=jOH8aP!tibM&>Ha_u=$u|H~FHC7rQ=_^M^L>=>6qAfm3bDNI3ZZj{}?rzf& zzl}x4WFIF!G z-`Uq{Q5~ttPA%j1MU$6ydup;LFYN}#sP=_E*^Nqj8po*i1wPrnMRirRDBpZsi}GT1 zEy`C0*P_(mR@I_{0o_+6e~(&KX&T3<_8{4fmhm)>QSE^q(x|kjag1u8f<`cnV^n*f z5%?C>S=FL^^KmW8Hy_ucyl&`PRC;G04CsxP@pO(+L9!cdQPVj_1$s!M(w@dKO6f6T z|2Jd1eM%bA+@nhKc_ru%aEI1vO!G7=nHvRYDoAAHz4QPXRV86c|UISv!?*$KI ztKSCShws4&@NMk!o8dY*9Tvc|;kVe}pMiHk1}s^;au1Z1JD7_hR3j>Z-(pP^)L=gp&kB--TVN20X`0!U;w&ccX$Tu0#ApX z;V5kA3U+ZBUJ3ibZ?T8J0b(D&23`rr!v63hWPB@J3)jG>LFC>{y7q+oNyF_RK7dcc zd*Ds50eazhXga4Uii9XCjaPb#BYlNUVhauDH*LrdEmkM=>Bt<@C!727m3&Y0NHja@ z9n6pLL`6B86ZN+6yF8k&6!Sy5zTz;4${12QGSqsxO@C=DYA%jcqF!Q}x0sJ5E{es` zoH@j?MFm{rZ#Z|w$~C#;m!76$kc-P)tP&d?9Ny3e7;ypGQUdgo0&>c&N5+TaX|}hQ zyXZjO(LkY+8yG6BBUi<~X7MaYPI^ltWeG!Se3o0y;We`(5m9Tz7bGjO=FGDv(S}m7 zFVfGUoO~txc7}@aRy!QizuH4&}?`XblS>+_79PU#W~0*Ns;S 
zxm?uTJe(gH7%KGTM)Jdj=5ndO!cB60rSWw`g_f3}YhXv@WMJ zQ*~~rR4%s?la`LVWGKI`Fx0AKqp~8Ipgu^*p;B*t$ORlLl*flEC-siyE4_orwXwYP z50&zjda|Mdrsu?*N@T$y8PH!u`t%WY~@RHD_1XDx-xhC^5b(Wmakg2-1H<3lw5m9 zmDD(-v#jzx@!d?Ntg2heV>oxHG|=o?d{2?i(RAb$y7k^%Z@yevLUTT%S+((=;@XzA zzGqh*dy#rh&0HK6b^j_~sLSQz*#%QmQ*O;zaOgZ;qja^BZj~z}F5mmg5!e4pry?yN z79!}M6CJMU;C%l^&+IZ9^kE!MbOv%{qK<$tv0RZ;*OyJcuP$zwsJ~{I-4JDu9H!Gv zbkAkF;6jjP+tW)Y-*6{P0&=Pw%hr^-VI_ zGh+G&2_O|F)C&-fu5oe_y+kVoR9CMX%$6|b0ChO3lq&flQ;Ths>R7xM#_FjJS6iEq zP;G6aV$tjH>-37NkK|o z6s4(NeL&KXQ3Y!zA*fVA3d~WRfS{(u6lCJnI{mT_!@iy-UXxZa(FR_|3Q2Y58-FH z8$J&ofww>rPKG03HtYxwVAKB;J_GNAH-OCl=V28r2C@BjhexsPW&Z!Ga21>nXTce; z0``Yp;IH@{Zh`CJLvRU*kK!bdHPw5=PH;bkdOLgy-UDxhSHen=wcj%T|15X}pT+Os zhwx>%5?%`Q|@Cc9C8^EA%jbWUMT zbQ))HI;SuvI*l_povCx8hgG#G-+Ww)^3BK1;P^=$*P>GAL_34Y#YVf$X`I0clig^G zn#LKN&MC}^PU8$t=M?5dr*V+NS?3n;`9W$?Rg3b?$F(Tmd|ZpFcTTi37||PTQPVj_ z1<7u-MNQ`z6(}N&N_!f|sE$C9X;j+NI7Z=6PH9mKt6G$AKCVUi=HptFU)bfwC^fhx zRr`)$KyS2+r*Vu5lig?;PvaQX5$GX}N;`Xus>l9+54L^Lp3nrhW8Z%ru7Y>NS+Ew4g9Bk#_$#*kJ#al- z0dIvf;S@L$#Qy&`cp5y2ZGR`+1fPWqVH5Ph5@?3qLDv8O5bXN@^Z738|IYwf|Gz)j z_5ZhE-^=>{4}q-zAAplU*8lGfJHh?f^*@1cz%?Li5-x;s5Ffx{a4?Ad|2u4ZvH!me zSHf#y1jPRD0I~mfh6l0jZ-#3@?Eftw_Wud61ong`5c~h@Aol;eLDv75L3{wm!(kvk zfZafz0r)Jjq2O{ zpd<3?Tl=Vxtn_iy(zZJv>a3x#@;saO@%RrtB5c~f@?0T{PZ-UQ)JpaE5 z#Qt9bV*l?Bk7C>Z5WWgx|DO+H|DORXV1L*J#Qwhp#Qy&f$o#)N|1Um(Q(+m14`3gV zH2^;cvHxZMU+n*J$irc9Fo^vx&;E=3|7ExmUJD~2_P;#)FZTb=@F2GR&2TM<{V&h{ zi~WBBbc4+Qix1!_@F#40@d4Zj?}ayk*#F0a*#8HC>;Z5uw*B`(?ElL^?Em#}Dzw4# zKPXK5I?|U z*!X{fpMmTzcQt$*WIq7$2@Jx?&lc*WJ8UwFKkvD3SfPk@AL=mJAIA&PJ?5WY2T*c z>exNxa&96Yennv{gh_?mo3F$s2a1a$pB%L=DUwsodZ80*icV(9n2*pLm^;@W8kaR? 
z<~Ua}G4HY`YY-*lBwG^i7txv}vcW1w6$=aXwTxU0`mo+8q17E-qr_VtEDiOA2g~E5 z;m_hozyFyc36*^x!mSFFdv-;$bxKImy24m%V)5e%C$2g+7e?8fy43cZsEKvemGQBW zhzCbz$xr1>3Wl0(Jx8=~3I@?MYHwk(8y#wp6ds~oh5N+#8 zq6Jehh^A3{3zOYwliGgT8=Yi)Z=3~FFo>qn$FN`u2GKP77%eXvKwtu(>O{k2z;}RnxE4-N(HJ+qtc$vQ7X_00>y7zZ|XBY7}Faq zVqf0~*T8vj4u~y&Hptw251awVf;>Av7iPgd*vQ`ndB$JX&##B0U^kF`%Wr_quow=3 zr-RIqe*><7ewYWl!OyXGZ-KAFm*I=>1yJ%t1oiy6=MH1fgC`ybFAjI~mWDE(sL?0G zCw+oPje_TAa`H&Ge`MZUd*;lP zovp#-I%oy-1hzft)Y+2k)`9dRajuWhVN?~6=WV;IJyh{0B5QU^<}tNIhg5Vo3wsGI zpzw&wad==`c1SjW2OF={-0M)6}n)xy|~^ z5{|Xl$1-Iz4g91_qE)tgl8Tw-I*nJ%x)M*RhY9EB=^5qhGvL43zH!|+#XDwl6|qCl zcww=eYq9}cK4e6-S>a^;xsi7y4;O|R#5?>}30yuq(l-z5tV%$5z}(f>-&!h2sdtDQ zW$j1f)8Uh0Yh*kn`lyyhRbsLpAx~6OotBK+nU)k&^WrH{2Qw2w4V}fmS!snht7%rU z=w?l(%)G3qVe!PR$w*jwZ`0J)Rv77vO8t2I3v7s?|3(HP>07jSJ~vPpDX?)58#?LD zjL@N6$5pG~@!ucKTewT(Lqo-5GPWbCcJe!oE^Ue+ZOTVaF_^7}XaW7L_Pfkg*zYot zpeHk;*nEfiWhO;NhNX*XICm@3Hz?YAnbS2+%#%hFMFshzxAqPA!ZRaSt`H}vph5fi) zvX6|~qr+uN%SLQJ)eX+hx?E-5QJ`)LY< zB!n|*xf7d5%zU3rP`0WWO*=+%>6RvYz4(dM_^7r})3HS@O=ADcJMWeo``^BwO!oA< z5k3PKW7A&*>)}jT0TCPkk6`0}3*HE)!1G{VcnDkmR#*?ug}vZDZ2B+3f51jK7H-Fm z7aRWwm&d|?~UF5T-XbCguh^ue;+;p@@)G!^ubct1Mb5%zaBQg zQSb;h`Hdj^^1Tdpft$(mn?U8g9z#7VjFqw5C}N{BGsQj=EwNn5k5v}yA80K4Af2`B zo~M3@7g@*QVi+_rs81V%K8TH|qBSw7A3g>dgnCPb{{EazL~}cBR@*|(DpRo=8&$r$ zD&sHLH>1YKE;rVXG49l**4C9Dw(Uza+Or^Srk5c0Lj8$F=Z;~?N$29tptK0v+M>c) z<3$vhg4m#Js#h518(UnJMvUy_o=!5{cDOi#mCbH~${9h2)!rK1B$i`+nJOnquh`)# zoh{Lk*yxALk-7Fr7-5)QSFGmN8xyy7R$A=N`YLzlja7HvSo_Y%rMEOZ$_|2R+}9DY zav}`1N*VEC)}^3&DJng&dfPT1AVbXRI#ZS7RUu8v z|GIABQ1o8NMpyQFvj3(>UGEg@(Sw#69!0~&&n4ewHCaNh71yxoQf(x4MPq8Zy^;2@ zsrwX%6P{wa$jvy&oO@XP2tCA`!n5i}s>j&%hALZh>Faah=(|3r#$PQjHxQ_>frdbT zzgZlv;o!`%O>rlzh{E4a_2nwn-W@KrnnB+)7~_N-2A5GmD)B-ksoWG5~In~GZ6lH5dzNu;GCHagH;KV}=LizD2YYdnw> zg1)64+l?Zisz+O(PoeEt$@r{>x$}8d($RXL(F?1uAWTyCsJdkf%1HG+b#BrwmhfI~ z{+?P$HiHL;K6fPqs{xQA_qAH*Fb&%V(hb}Bv&}kBk$Sdbl2|u6VYi+>x?7Qio#Eqh z;IzqaWXw!=el;+R)>c(A#?ROgA0I#;my(U8H&eW#fQbEnD0b};#{Ta%HtwgH>;EXc 
zA6^Hq23ZFn&-4EZ`~H`3EBq6?{^ua;@ZSk#koEVS@O1bQHvXqT-n+LHc7g}6^KXT# z;bM?A_J_jXvGsohvZnqW@Mbt4UI(v*5&QtM9{vv6DZ z(AGs>%Rf*x=ITqHM6Loccr(U{hDNkH%+|D_d|R5bHvRPsai;6pN&|9k^LTzF)3TM5 zF_F8urPxRpPj(&h-IlV9rHnqcL33Z3?30yBw!J^iLJ1RQWHNzuo^7kkW28DedKhsI zR@+TyV)jSWFj`WQsW(D3Hx%&H;Z!&aTYojo1=)9B_T0M_J_xec{&DbB_%3$-n_&&K!(G_*?}jsA zAGi%0|9sd2WwH0+BzPILVDtYIo8CU(E^G4-hMnLJYp=FddlVaA=Kkf` z^o?*9ltATIXLY;%Uze8(wYtLUjB-ZXv8OY-qFOa?s{N4b@4}mqp0J7XGvziMHS=A% zw#Vu`Cwt^FSMlR5!76hy)VW|Z4_Q;7M;`B6@UsjuzoGBS7d^vm91r#f(+9LZ`BOew znRV9VGiu7nQL-H;Hr&$5A7a*OrPR0qTU zWF-}LE{}gfg{jqYdK`@Dx?2nU(sm_Su-Is3IT}_HV}qWCoLP=$#=)S5Fkm!4d6uJj zW80_!ERzl1Be<cic z%MctY4Bj3Ok0P-~7L|E9t1&FgO0Oa2aP0e6m2#Meq`sQ=Y_u2h6J(qh`+sjN;Jp*} z|38@Lm-qYq9b}LHv#|MB!5sJp_Wt+bU9bk81HZ=RzYflYP6) zwW2h(F+bK98oMS1@x}A&N|j1!I47QmaI9jW3V(VV-&gB8tm2pi!}!AD3Xhw59pB2r z)pdVwTam(R%`w`94=2?o7MZF?t`P$NFpv zW1-#jj=j14j8F1#UvveT>gkSx;_l%f2}?XMAVE1wP#F#bPq!rB{7K@%h+-kv@;I-) zoqwQEu4BZJ!O(0vqXu0ZNb)Qrx{i8`F%0iQ%lM}in8l<>o?OtO2gA@oVo)@iW#zmb z5BS~d9}Tow@?&LqX?K&!pi&H+Am*|=xjm8no^A=@lmo6!*UctY%zSI8L#+z*p`dT8 zN23a_RHfq7pxSy(Jvx(B`qGq6!j>Tib*L_`pVjxlzARR)>sgn4=U+MT$Pe3 zR`2~$k=9;up@QhhV#_kjF3#~K7stnj_S%Cs+C)&{Lp|h)txowKKhURwQ8{g-HX&6n zJQZ&g_P(0x%2K-Q`LN`Iq*;YQ&AcW33sdz&IW;qO7g7{f4SuHFhoi<Y_jOB|X$7JjblM*y+F4cCU ztA=?83MNyg3k5^q#mr9$qJVw(tn*CVyRXEp3@OP0^iGtN<|q zddG*$);r#PrTYBrGX8EDV)>EnQn~_LCQWXyHto1jzA<;#j#7 zWSGKI*$aw z;v;F;QVA}tZK}|Iv^zS}QrUJ+--d^aY*VFoXuBFK+itWVi?&+{6#l?MnTp3Z?q%2d zQ1uLLJXGxg{E4hn1#M^ohrE@79fIrCtm2YZgUi&4!)O#i{~a1_+qH>!>1`c-%Evd+ z^hlao<()@vWqhRmY>|2GC%>T}eo;MLsAfR*sG!5gwJZqelB-A!`D{;i2jwv~b<_WyoZy+;}Q|7c?Y--b;ud;f`jUx3BX z1hUt^ytnT}cs~4%x&N=h2+00_N5NmP|9=9XgxA5zuroY{PvHOH+wd(o6@S28*!OqB zx8XdHz51UCU&FQ^gC+29@E7d(uY%b6{|-6mgh!bB|6lkdtb_gFDR3<|{}{Xi_5^tb z;C{Fj-VZ}C7akG20WHNzAz)ICs3pIB;YE@x=Er~|0mwc3scD6L> zOZ){=Ek>=I90h_4tfPE1@RUmf5Pue--B%WJtL^)B#ZOF7`1d>(_=F&0tjX4 zdojQNLE6-W5tCV}Fx$@R2eb8SA}3^L()id_-JnQivqF|z+LadW>3?=KQX3bB#Syyy z{Wa7Z(}7dV%(jItLfJ&nOmzF51$5j=Roc|uNcS2+N(1$s{SfV<4xdsv#uke)iNuoX 
zF7ipGe!CqOO^iONoV1%Du&9H`tCDJs1J$N{QbTJ%+rOC`9{7u5`hWbY>LnijGHJbT zort>3Z!?rkZ>Zr1H3W07p9zCVwrbLwH}^8G>3XMfmL5ROG3ef8V&gzJ@Je2uk&NU| z=8Y;P4v_w=hes!v%8+QhAoSXYn6qA=v?Cc^i*0rODqAZ<^{27v{iZ}R^irbg19jhuv{aQ@tNSa_mB8*Tpy{6PDp`-J?>+4!#V|%bxzZPMN&fG#d zbM0-y5+vl4*X(fRJz1-wT9_G@MRma1Qh(xF7f$ABwu5v8sx`3Q%9{x|^Dq;}3uOiq zl~S~>z&6W#<~NksW3jKefwxD^UAKAe*@dxEG&)vV z$8OJMNnuu`en6>z;YD!GjQdi2==hss+DtfNW~^BCGR$7H8Z}ADrKl*qrE%Vtt)2JV z&3PFgvQ!e{{=1+Ngd_D6$#H2rNpwN6|BuGTJqUYO=KtG`z56BR`(@w1jUf8~6d(sn zU=b{Ue(e8uVb{yPf9F9bHvO~VKJ5DI;35zke}9nu{AJF6H9U-ce;vFBL#Pz~A9-Fo11;8)^Rmd=cITJ)rV4f$d)3X{o%v zn%D1|Ymy7ycy&x=^JpQ}Wo*uPHBR+`X2`Fl1y<>MDyAdqeCvsEY1$i1;A6JIYU>>_ zMF@32Y(Mo2E|9j?u7d+H<1&ms?5o1#PP-2xnYzspM3yvrtPE^7B34hG%Y+j}lA#PH zLe(yjP)&N07Dz%F*5#om@U`9-l$6&Rtqf?UBpnPx9f`)S?}@=W|jX zxY1YC^-NEb4QoG@b7WwAtl<17BHSWxzLJNc62ZOKReUjZxM}rW{AG1IYxQNePnSxO zs!hkbGb9xDV!GzpT&8&u?KwX>h2BK}{*j4%h{O{{VU=P1^tbaGI+x-^uEdmNg;gox z)2rbyjHk=``poNDbh<+Sp-WeW-_WK@DrN&7!tk<&e1zeJF}*tBB{a=2Wy||2f1z!$ zrf#I@)M=}i#ml!9-F|U*RYSv(ogROZ4n<^NH?&~LnaVN&vwGYrXIVO2B3NyW!Rjxz z?+T0~8jj6?%&Y`w>_s|va;XmByOkOBZz#_@nrgjeF;Pqz>R%#M?ZPy=k1rOP-wVnu zyF{z5e6kj3s`@DUV8hCzS0ep48;u<+8c`!d1wp1YZL@1=PFC!H%;=^Fn_2Atql{5} zJ@);#`={$KV6TnY2xH~0nQJ%F!;m%?-4 z2lxp79Y$dtEQAHn4sCEP{({YL6#N5!!S!%1JQZ%lS8yS$fTzJX@e|wxp96U>z(RN- z+=H*+F1QmGz+1@VZ}AoUM*IbEDlCHi;ft&TxE?+ae?$&S7Df(kfqbmx@}A7${K$Yj zvp$j^E;OH5Sb6e9y>J*NXthZXl{wd1~ieuxFS+G#BY3(MVx{H{@+7 zA1Xn0}Hm^Q{~md+kep_>wU_rPGO-QR13-+6d}3t=9tZS>zYKO z8`3Az3Vwxtv|rbt>Q!!%7ZaqeS{#fvK{a)6^NmZqp&DvZq3XwE)9I>VkW60#TceE1 z*Va~lzp&Mk*(KE@nNAK;6*3cQTZwHCvaN*uPD?n_|2*8Cm>n+@UK+d=s{h1QMpg76 zd%BXhGUXFbPeuio5!9=jezykKXl(7%fkF*jw{w4S!#+c_f$-*&^jaojsb6DO7gpsn zWp#2~1Wv>Ej^vtyT*4)t)j=eJEz#kQ2s#UEqv~Z7UhXM{X4>Ktz06EDQ;IAmfGL)k ziYwMrMrm^mOH{b@0%On>l~vU^>b&|3nv(5WSW;vb7Wy$%!iI`(+HVz~T~5UkLR?Xm zVsEZDU#=`+z&)bbN+?UmZ3!2g>DXLS>T=D_ZZcPykAA!JLA`4g^>zNTh8%`+u`T4R z8Tk5rhAyV=CY>t(zm?BkHvSek z8wOw=%!2P=>wg^H1_Q7Hc86Wz8L$h?#{NH+cqQ(Qa2hOy{b66&2lfVeU%*#L!+FpL 
zOCf^Cu>bFeufYW%dj$5wI*@$=bFdQjg$MBsTnqBvK=Bov2<`9)K7zl)U*RFR2i^yH zcquf)&Tub2gCE1yP=-a&0eix;VK>+no(cblKS7>FconRMeL>bF+zwxYx55DY8{97b z1$ZA^1Pfsg_&dIa{|66%JeP1EoDD7T2tJ7a1$hoZ{-mV!pHuRjTEGgLa%{|7$K+Qu zm@nsw10$ueLT)I(j_##Z*WuWqCL61scT@VNw1+;~s6xJ!M;U$dL%0-Jr0N4F6Cvxf zl8>;P^r@SByVV{omCM0u)tX9Yxzt}7&Tq;MaZRp&G^e77=`dP_(ZN;{%2*o5o^Uc& zJ~KUbe*xZKNhY)gqW_mHDRaK;^_F0}AfIFTwx-#vV&mqqE=*G9ub(?q8o)QSso6=T zM;;qMgbcO1EYwOljs@c7TzRdRapFM^m5^jYQ~5m#XBhE)bWR7FQRy*LQJ~l!sHTMVa9Z-n~3cV%O@+ zR%GlLwnYV86UjZC=BawLT9r=q7A^o!p7`!h8FeRl8PA=}p{tCz7g_6M4alq@`S4B3W*h#8Vkb*%vnivR4TYMzmB(2g&g* zinPUB6bGbPsA8$BLFvouZWP)sle=ushB5WR6oIHx48$mf8DET$C?&EPlv1*!lhd(D zXLwa)>(~mcx00rU_(oP!naPSV%axW9KFE`Kjf(TlmJ?tEVrtd<5 zu^xKUZnBmzPC;n0)PK9>&??Mf2F)?4S`16U;1_m^bA6|XRJK#67zJa^HtH2s5pU&Q zu{^GuNL2;(?IY+KWiYSVhud1I+rIt9bO)v7)R#sY9C>C~9_MM2Y6~J!6A2pRBCAy? zJ)>qGf;wgRq!GB09+2TufRC=dj;~aAN&>D z{T6r+oCO0=fIhef`+Y6E9FBuOVz=K4pNF@=C2%o3gw1{moD4@B`~64A^M8yD|3N;L ze6?sB{NL-^o2}k}f{-5?9n9;opJqgVVZG*TVA|3ohvcvJxEmik^L>4}G7lc-+3MF< zX(k-?54MevTgHu&%^8z%_0Es-q*rExl&hemAWIKc~P4P49g4Ej3o9k|RlDLev zHeuRKv#(B?IT5DJ9Fl~oF-RAkD{RWzmU~;Y&V0=*`G(ddOqdPEhw3bFFibu_v#LRT zowQWC%vvBPW7WW$Il4q`we(iJEon2ZH;{;o(yoyPbwza&Op%6~g)UX!m_r@I==;i7 zu@=KJ(YR6W^$r8pjZGWJZZN(LIs@j6LZ`NF_Y*$c-(dexLkg*VgUWpR-n;b`i7bE0 ztQT@Jqh3g-r+k>MzX)qYN|&HqD-CoDBrBt)G?c2Kk_3q|M{Syd9wDvni1$Ug7WItD z+6Z-3xiDw>ZA*|xi+1<9&STc(s+5j1=zEa#)~;p^iIBcWs#zdDj%d-P*Ssd(4CQ|{ zHpU#{_=`k0bWT8@Q5`yFIAu5mgBw&;zl00V-Yu9H(tdZPZQ5E~r=}b8ldoeVAE)Nh zF>2M&O!j*Zpc2kdl+1k-=%x0iP< z2L-4cx!!-IudLexMO&KeoJ5mJrB_c?lgjBCl}XM?_{eIyG`Y18S!+fjFx}nclW{w$ zR-;}sfb@b*cg(8+BzOKE&S#2tKo?)_IW@vaX^M1$rWv~J#w74m+M(@NeH-LRP+QV^ zuBmi9#I72TQxy`91D*M*E;gn2$>^w4`u`>!sr)aZigm+P!V3adr6O$#$M*IagMgmQ zak;JO)rj@=%!&HGK2zc=^rGIXo?bowk7@4K|MU2``rCxvFZ=!958nrw|6c>$@NC!( zHe>7G#l?5Rx8Y1U1`dU%!Bb!-I0KvhYHawc;1lp}I0p8Io#75_`cK0r;cD0iWDfrv zI1^;v{s-_CxB%V=vhM%sAo~P74Iac^KN)0CzzFWbUjGF=gnhmX%!0pSo4*Kl2HErP zI(R2+fR{i7$v>$KzDeA) zOH?NyHM-V|OH?E3lJ`UVoq&%bI1M*DOz35F~4p 
ztssaz*=u7p#z(HSp)jT%srDy$L@kySF<}ZK4%g0tUd0`YwY|iH9>JNf*bs|!{2U`>@|h}-5r!(r!JL>;x?3?>s)eiLfKx*t?k zm)&@^pSWT-Sf@?WYFFvn>320c>|!#~KT&mUS)8b;woZmmYi#rSSv$X~dQMzaS#>X? z+NJlo$vMT*(#Gt08#x(s1XU%{cC)w%bOD+Al=NrYa5!S4jP{V4vW#jpal%X;a_Q3b zB$+G|N~J33(CC)}7cn<$wr=89?LVYVph5V*AhZpMA(wD9rgU`@LnGbY#Q8=kaCuI3 z9cKBH>nxQ5yT!4mac+1#+pipiiPxS)-)k#H=wk|NQQQ}9L)Sma5Tql^sTc%_Cp9^W z`udc}ggFs6Qe|Y!kR;UjX=D{qY5AxuSkO%WUUJGQ)HF-Q+pUvDDE!nC2iO|fN;DaPu zWLW5STFBMi#(M{UpRBU?AyQMK%C;&YL5a)!|9p(zi^TS2{(rSGfWLzM|DSL^6hYnr z*akboec1l87r-Y#o()(A^Pvr%1AoCU@I`nZjKE^(gx}%|_$k~5{{xr7CU`OI1-rrx z_zFHN{sOoX{tG?|7vm>53%|f9tc5%93w#OQ3niEbhk~r@mv;nS3|l~a0`p-nxEFuG zjqq+LK^yD>GOzz*xB;#Nnb$uDR>6TF{)0c`Gx!Q@fwiz2j)PX%4Zex|J`PHbNhd#xw8?v*tyXWtkg3OL>)lu_dO~MYGI`?>h|W(_p+&KZj5#NrP$+cVOUHY z^<)?mC$Tu9wkC;x<6sdpNTcMH6#QikEAn<_V%`Ai(wrfcG$E#zci8$acrJmoVg zwF)P-_Qg2<(qBf^nI zZbh$ktlHF;9_=V5Iey2a7HmtAD1SnVw5H-_rl6gJy+-YJVPd*pb+2M;@wRrwn}(r$ z3f2C2b2e??I3?Kz8jH)gs-39~#RZu_Hl}`WYtiUNmLVB^GB#Ubl4S*xvV=h38ReWN5QGXfyFdmr`8fXn;sOq3kD)=(0+e?i95Auxd>< z8Mb66qq?iv(xD;s0wrC+^OdNFxyQp>Bc7_?xK>pQtKjPpPUZG0sY`-W+UrvAwPueE zP*bzFch#SX7n5manED+m{61mZNFf*d|ApAE`(WdW{lCE2syAWhp9>q{#qd1%3AX(O z@G95<{0;uD=-f668AsS} zop;?!;t!7BMx{$G2-2m(O}ua?vmBY;jaNmCnAGxhj?FY07q^yLWSWkgraG&SgH52I zwz^iN>%DEXwmAnOZNpG8#n;-Y4pOH>-zm7v-oH@E4Gfjm@q7_WY}=LpfgVIEDe%Y7r#E0E`5_mn`13}d>ne1w}XA$cBkPo-4et{Z)GEpsVwW5ptDu;Kh8Np zxL{Vt^b!&qfAY!|>$A*^AnC?zQkSa#Y%R~ey9sPlN>;POM%^Z~m4LpHEqixi%=aQs z21pf1KkKs_$Uq(#sY-b~Qo9f4_V&<7j%P~QP5$~yyp2j(?El@d^z!%s#Qwj?Sbeu* z?|&EG4ljr0a14kaU|)C?oBv^uy#YQ4Z-o-9gcf)j{0bky-EbFN1gFEPAbSM701kpb z;|ur-h;QH`*by$nH}G~i2l_#tA6O1cp&1T_J>W6c1l$kz!Y|=_@FlnsJ_hfE*MYoe zpag5-IFNS@%!e1iv)~bY5B~?hfgghGHz>~|ybt~p-T<=K;7eg4JPRJiC-H0e9()Ym z0k476VL7zH^WZ`J6?Yonh3rLe32cS}bVCd51%JSo@e}wOd=iJO}=X zZ{zoH1Bl<_U2q<}99|4v@Jx_*B+4EJx4;!3`x2fDr@+zB3j4sH@QwT&Zi36sTw()H;AY*6!k)=i=j7Jihj|L45OEeDhpv5{XR4@D3phx1qXpcNn(CMYP48P@W9mpvb*ij8 zCNJyu)KpJi*6pdOPL(z8HQUEGD(meyGLb}4zx~j}GQcL=rw8MU9#& zN}_VawHj3mhgeGqzl3Btn`CWj0u5+q2=JRM}73qt=bK^C;q%nHnrsY;fUZ#moC 
z!~%)f20pD4ZgEXH+8_#lgnV5klyZt}VaelHO!Jyf)TDCG;e%hMlt4=y@^{S|MD3>4Z%Z-ka#{TH=h$kc1>qb5Q|1<2* zk7KKf{Xf^(aDTwYzY^XI6?hpO4=;t6z&Eh(m%>pn7k-B=e-((GFM9zV2hFe-+>cFv z4ak0gmw?RwzX{$5V()(mTYdrj8C(2Ma0h%GjAbSBWgdIWb z>94>?;A~h8JHS`4i$4snhjBOn9>y-d3w{beg4^H+unu;IU-7)aC*eYnKke<4|J`h{ zag`}ErdHPIx@}ZmLo)#*K>WS!+{O(07vbMBHSu0n)3(m|w z|7=k%TClBje!6ih?5Shcf=hQe(WW{<(C~49V7nph!Vb38vFn9x^5r&eqA)h)Cw8jM zV9=_V+)mpcXFKi`LkiNG6NeK`Nsi3y^FOoCe`$neB~sC|9^~+Im%*MUQulqswmVAM z2~yQO{dWBJu(S@dO*R{-`>pe{rbm-qdAJerq!Q3KRLZBMVN!9Kp(vIibIk0{S>N~+ zW=dY}ki9o-b;w}$S9Lp9(5^W3BiMSu3_eFk?%_t_;u{ zOWI#S1!7E@RGpzQWj;`x$W84R7uY@ho8m2Ed*>xXMVnk!0;MewwSUYAiRhu+PK|Rw z{9CHI%)BVW^zwAwy4uG3a8Z=v>zXE4NUqukLyS8X^F(Bf358AByVaQ_5hSa>4Ek-i zYeB7t>}n*UcHNy+8iE=edq7Si3p#bJkp|kV8WNN$TU(D!Ex;Nn4eLVo^Jp^v{{Xh} zEyn(rcLB)X7Hs^3;6T_7zJ-l{5o`q62e1?5S^u5jr`Y)Khdy`(90g0@a5xMW!9rL7 zGVgyMHvbi{8Dzh}7lQZ!euEF-i|`g$20Mbx?cW5~LLR!|AlMJ~f#Rj0Zh@U9Ww%i%O$bIw#+o zJoBd984tR%a09ECUQ`?r+8JRT7QFG*J1qDScUVZjq|Ni0cjY2F-l*+;ypQL1 CMakeFiles/gtest.dir/gtest-all.cpp.i - -src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s: cmake_force - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/gtest.dir/gtest-all.cpp.s" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++ $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp -o CMakeFiles/gtest.dir/gtest-all.cpp.s - -src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires: -.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires - -src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires - $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build -.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides - -src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o - -# Object files for target 
gtest -gtest_OBJECTS = \ -"CMakeFiles/gtest.dir/gtest-all.cpp.o" - -# External object files for target gtest -gtest_EXTERNAL_OBJECTS = - -lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o -lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/build.make -lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/link.txt - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libgtest.a" - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean_target.cmake - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/gtest.dir/link.txt --verbose=$(VERBOSE) - -# Rule to build all files generated by this target. -src/gtest/CMakeFiles/gtest.dir/build: lib/libgtest.a -.PHONY : src/gtest/CMakeFiles/gtest.dir/build - -src/gtest/CMakeFiles/gtest.dir/requires: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires -.PHONY : src/gtest/CMakeFiles/gtest.dir/requires - -src/gtest/CMakeFiles/gtest.dir/clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean.cmake -.PHONY : src/gtest/CMakeFiles/gtest.dir/clean - -src/gtest/CMakeFiles/gtest.dir/depend: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake --color=$(COLOR) -.PHONY : src/gtest/CMakeFiles/gtest.dir/depend - diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake deleted file mode 100644 index 694feb83..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake +++ /dev/null @@ -1,10 +0,0 @@ 
-FILE(REMOVE_RECURSE - "CMakeFiles/gtest.dir/gtest-all.cpp.o" - "../../lib/libgtest.pdb" - "../../lib/libgtest.a" -) - -# Per-language clean rules from dependency scanning. -FOREACH(lang CXX) - INCLUDE(CMakeFiles/gtest.dir/cmake_clean_${lang}.cmake OPTIONAL) -ENDFOREACH(lang) diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake deleted file mode 100644 index 2c9ec14f..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake +++ /dev/null @@ -1,3 +0,0 @@ -FILE(REMOVE_RECURSE - "../../lib/libgtest.a" -) diff --git a/src/gtest/CMakeFiles/gtest.dir/depend.make b/src/gtest/CMakeFiles/gtest.dir/depend.make deleted file mode 100644 index 37ac348d..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/depend.make +++ /dev/null @@ -1,2 +0,0 @@ -# Empty dependencies file for gtest. -# This may be replaced when dependencies are built. diff --git a/src/gtest/CMakeFiles/gtest.dir/flags.make b/src/gtest/CMakeFiles/gtest.dir/flags.make deleted file mode 100644 index 8b4ef992..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/flags.make +++ /dev/null @@ -1,8 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! 
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# compile CXX with /usr/bin/c++ -CXX_FLAGS = -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe - -CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE - diff --git a/src/gtest/CMakeFiles/gtest.dir/link.txt b/src/gtest/CMakeFiles/gtest.dir/link.txt deleted file mode 100644 index e5645cfb..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/link.txt +++ /dev/null @@ -1,2 +0,0 @@ -/usr/bin/ar cr ../../lib/libgtest.a CMakeFiles/gtest.dir/gtest-all.cpp.o -/usr/bin/ranlib ../../lib/libgtest.a diff --git a/src/gtest/CMakeFiles/gtest.dir/progress.make b/src/gtest/CMakeFiles/gtest.dir/progress.make deleted file mode 100644 index 143c9b1b..00000000 --- a/src/gtest/CMakeFiles/gtest.dir/progress.make +++ /dev/null @@ -1,2 +0,0 @@ -CMAKE_PROGRESS_1 = 65 - diff --git a/src/gtest/CMakeFiles/progress.marks b/src/gtest/CMakeFiles/progress.marks deleted file mode 100644 index 573541ac..00000000 --- a/src/gtest/CMakeFiles/progress.marks +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/src/gtest/Makefile b/src/gtest/Makefile deleted file mode 100644 index d1a96ceb..00000000 --- a/src/gtest/Makefile +++ /dev/null @@ -1,212 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 2.8 - -# Default target executed when no arguments are given to make. -default_target: all -.PHONY : default_target - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Remove some rules from gmake that .SUFFIXES does not remove. 
-SUFFIXES = - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E remove -f - -# Escaping for special characters. -EQUALS = = - -# The program to use to edit the cache. -CMAKE_EDIT_COMMAND = /usr/bin/ccmake - -# The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe - -#============================================================================= -# Targets provided globally by CMake. - -# Special rule for the target edit_cache -edit_cache: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." - /usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) -.PHONY : edit_cache - -# Special rule for the target edit_cache -edit_cache/fast: edit_cache -.PHONY : edit_cache/fast - -# Special rule for the target install -install: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." - /usr/bin/cmake -P cmake_install.cmake -.PHONY : install - -# Special rule for the target install -install/fast: preinstall/fast - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." - /usr/bin/cmake -P cmake_install.cmake -.PHONY : install/fast - -# Special rule for the target install/local -install/local: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." 
- /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake -.PHONY : install/local - -# Special rule for the target install/local -install/local/fast: install/local -.PHONY : install/local/fast - -# Special rule for the target install/strip -install/strip: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." - /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake -.PHONY : install/strip - -# Special rule for the target install/strip -install/strip/fast: install/strip -.PHONY : install/strip/fast - -# Special rule for the target list_install_components -list_install_components: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" -.PHONY : list_install_components - -# Special rule for the target list_install_components -list_install_components/fast: list_install_components -.PHONY : list_install_components/fast - -# Special rule for the target rebuild_cache -rebuild_cache: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
- /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) -.PHONY : rebuild_cache - -# Special rule for the target rebuild_cache -rebuild_cache/fast: rebuild_cache -.PHONY : rebuild_cache/fast - -# The main all target -all: cmake_check_build_system - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/progress.marks - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/all - $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 -.PHONY : all - -# The main clean target -clean: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/clean -.PHONY : clean - -# The main clean target -clean/fast: clean -.PHONY : clean/fast - -# Prepare targets for installation. -preinstall: all - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall -.PHONY : preinstall - -# Prepare targets for installation. -preinstall/fast: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall -.PHONY : preinstall/fast - -# clear depends -depend: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 -.PHONY : depend - -# Convenience name for target. -src/gtest/CMakeFiles/gtest.dir/rule: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/CMakeFiles/gtest.dir/rule -.PHONY : src/gtest/CMakeFiles/gtest.dir/rule - -# Convenience name for target. -gtest: src/gtest/CMakeFiles/gtest.dir/rule -.PHONY : gtest - -# fast build rule for target. 
-gtest/fast: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/build -.PHONY : gtest/fast - -gtest-all.o: gtest-all.cpp.o -.PHONY : gtest-all.o - -# target to build an object file -gtest-all.cpp.o: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o -.PHONY : gtest-all.cpp.o - -gtest-all.i: gtest-all.cpp.i -.PHONY : gtest-all.i - -# target to preprocess a source file -gtest-all.cpp.i: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i -.PHONY : gtest-all.cpp.i - -gtest-all.s: gtest-all.cpp.s -.PHONY : gtest-all.s - -# target to generate assembly for a file -gtest-all.cpp.s: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s -.PHONY : gtest-all.cpp.s - -# Help Target -help: - @echo "The following are some of the valid targets for this Makefile:" - @echo "... all (the default if no target is provided)" - @echo "... clean" - @echo "... depend" - @echo "... edit_cache" - @echo "... gtest" - @echo "... install" - @echo "... install/local" - @echo "... install/strip" - @echo "... list_install_components" - @echo "... rebuild_cache" - @echo "... gtest-all.o" - @echo "... gtest-all.i" - @echo "... gtest-all.s" -.PHONY : help - - - -#============================================================================= -# Special targets to cleanup operation of make. - -# Special rule to run CMake to check the build system integrity. -# No rule that depends on this can have commands that come from listfiles -# because they might be regenerated. 
-cmake_check_build_system: - cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 -.PHONY : cmake_check_build_system - diff --git a/src/gtest/cmake_install.cmake b/src/gtest/cmake_install.cmake deleted file mode 100644 index 14c33dd5..00000000 --- a/src/gtest/cmake_install.cmake +++ /dev/null @@ -1,34 +0,0 @@ -# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest - -# Set the install prefix -IF(NOT DEFINED CMAKE_INSTALL_PREFIX) - SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") -ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) -STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") - -# Set the install configuration name. -IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) - IF(BUILD_TYPE) - STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" - CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") - ELSE(BUILD_TYPE) - SET(CMAKE_INSTALL_CONFIG_NAME "Release") - ENDIF(BUILD_TYPE) - MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") -ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) - -# Set the component getting installed. -IF(NOT CMAKE_INSTALL_COMPONENT) - IF(COMPONENT) - MESSAGE(STATUS "Install component: \"${COMPONENT}\"") - SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") - ELSE(COMPONENT) - SET(CMAKE_INSTALL_COMPONENT) - ENDIF(COMPONENT) -ENDIF(NOT CMAKE_INSTALL_COMPONENT) - -# Install shared libraries without execute permission? 
-IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) - SET(CMAKE_INSTALL_SO_NO_EXE "1") -ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) - From 6b9fe7a5719828d3935f8729c9824e67aecce6b7 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Thu, 17 Sep 2015 14:28:14 -0700 Subject: [PATCH 114/124] integrate Mauricio's code review suggestions --- include/caffe/util/im2col.hpp | 30 ----------- src/caffe/layers/dropout_layer.cpp | 2 + src/caffe/ocl/im2col.cl | 75 +++----------------------- src/caffe/ocl/random.cl | 36 ++++--------- src/caffe/util/im2col.cpp | 84 +----------------------------- src/caffe/util/math_functions.cpp | 21 +++----- 6 files changed, 26 insertions(+), 222 deletions(-) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 531b11ad..327d7415 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -52,26 +52,6 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset); -template -void im2col_gpu(const Dtype* data_im, const int channels, const int height, - const int width, const int kernel_h, const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); - -template -void col2im_gpu(const Dtype* data_col, const int channels, const int height, - const int width, const int patch_h, const int patch_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); - -template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_col, const int col_offset); - -template -void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* 
data_col, const int col_offset); - template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, @@ -88,16 +68,6 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_im, const int img_offset, int optnum); - -template -void col2im_gpu_ocl(cl_mem data_col, const int channels, const int height, - const int width, const int ksize, const int pad, const int stride, - Dtype* data_im, cl_kernel Kernel); - -template -void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height, - const int width, const int ksize, const int pad, const int stride, - Dtype* data_col, cl_kernel Kernel); #endif } // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 2cb50ead..905ed6ec 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -82,6 +82,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, // NOLINT_NEXT_LINE(whitespace/operators) DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data); } else { + if(bottom_data != top_data) caffe_gpu_copy(count, bottom_data, top_data); } } @@ -99,6 +100,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, // NOLINT_NEXT_LINE(whitespace/operators) DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff); } else { + if(bottom_diff != top_diff) caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); } } diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 09f240cf..d03463ee 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -23,39 +23,6 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
**************************************************************************************/ - -template -__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) { - int index=get_global_id(0); - data_im = data_im + img_offset; - data_col = data_col + col_offset; - if(index < n) { - int w_out=index %width_col; - index /= width_col; - int h_out=index%height_col; - int channel_in = index/height_col; - int channel_out=channel_in *ksize *ksize; - int h_in = h_out *stride-pad; - int w_in = w_out *stride-pad; - data_col +=(channel_out *height_col + h_out) *width_col + w_out; - data_im +=(channel_in * height + h_in) *width + w_in; - int i=0,j=0; - for(i=0;i= 0 && w >= 0 && h < height && w < width) - *data_col=data_im[i * width + j]; - else *data_col=0; - data_col +=height_col *width_col; - } - } - } -} - -template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); - template __kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { @@ -93,7 +60,7 @@ template 
__attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_op template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); template -__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset, +__kernel void im2col(const int n, __global const T* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, @@ -127,17 +94,17 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in } } -template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, +template __attribute__((mangled_name(im2col_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, +template __attribute__((mangled_name(im2col_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset); template -__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, 
const int col_offset, +__kernel void col2im(const int n, __global const T* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, @@ -171,46 +138,16 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i } } -template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, +template __attribute__((mangled_name(col2im_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w,const int pad_h, const int pad_w, const int stride_h, const int stride_w,const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, +template __attribute__((mangled_name(col2im_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); -template -__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) { - int index = get_global_id(0); - data_col = data_col + col_offset; - data_im = data_im + img_offset; - if(index < n) { - T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; - int c = index / (width * height); - 
// compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); - // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; - } - } - data_im[index] = val; - } -} -template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); - template __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { int index = get_global_id(0); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl index 7f8bc5b3..468240f0 100644 --- a/src/caffe/ocl/random.cl +++ b/src/caffe/ocl/random.cl @@ -1,33 +1,12 @@ -/************************************************************************************* 
- * Copyright (c) 2015, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, - * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - **************************************************************************************/ #pragma OPENCL EXTENSION cl_amd_printf : enable -//beginning of the looooooong gpu_random_generator kernel -//we use the open sourced threefry's GPU implementation +//Note: random generator has two parts +//first part: the open sourced threefry random generator kernel from DE Shaw Research +//second part: we wrap the kernel up to generate uniform, Bernoulli and Gaussian distribution generators.
+ +//begin: the open sourced random generator from DE Shaw Research +//https://www.deshawresearch.com/resources_random123.html typedef uint uint32_t; struct r123array4x32 { @@ -803,6 +782,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, } return X; } +//end: the open sourced random generator from DE Shaw Research template __kernel void PRNG_threefry4x32_bernoulli( @@ -847,6 +827,8 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t //end of the looooooong gpu_random_generator kernel +//We wrapp the kernel up to generate uniform, bernoulli and gaussion distribution generators. + template __kernel void PRNG_threefry4x32_uniform( __global float4 *randomnumber, diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 89985534..6899d15a 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -153,7 +153,7 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset) { - std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix(); + std::string kernel_name = "im2col" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; @@ -200,7 +200,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset) { - std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); + std::string kernel_name = "col2im" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; @@ -241,46 +241,6 @@ template void col2im_gpu(const double* 
data_col, const int col_offset, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im, const int img_offset); -template -void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_col, const int col_offset) { - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height_col * width_col; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset); - - size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); - clFinish(amdDevice.CommandQueue); -} - -template void im2col_gpu(cl_kernel Kernel, const float* data_im, - const int img_offset, const int channels, const int height, const int width, - const int ksize, const int pad, const int stride, float* data_col, - const int col_offset); -template void im2col_gpu(cl_kernel 
Kernel, const double* data_im, - const int img_offset, const int channels, const int height, const int width, - const int ksize, const int pad, const int stride, double* data_col, - const int col_offset); - template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int ksize, @@ -327,45 +287,5 @@ template void im2col_gpu_opt(const double* data_im, const int ksize, const int pad, const int stride, double* data_col, const int col_offset, int optnum); -template -void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_im, const int img_offset) { - std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix(); - cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; - int num_kernels = channels * height * width; - - cl_int ret; - ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); - ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); - ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); - OCL_CHECK(ret); - - 
size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; - size_t uiLocal_Work_Size[] = { 256 }; - OCL_CHECK( - clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, - uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); -} - -template void col2im_gpu(const float* data_col, const int col_offset, - const int channels, const int height, const int width, const int psize, - const int pad, const int stride, float* data_im, const int img_offset); -template void col2im_gpu(const double* data_col, const int col_offset, - const int channels, const int height, const int width, const int psize, - const int pad, const int stride, double* data_im, const int img_offset); #endif } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index aebeb5ed..93af3e23 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -463,7 +463,6 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; - //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, @@ -640,10 +639,12 @@ void caffe_gpu_axpy(const int N, const double alpha, const double* X, template <> void caffe_gpu_sgnbit(const int n, const float* x, float* y) { + NOT_IMPLEMENTED; } template <> void caffe_gpu_sgnbit(const int n, const double* x, double* y) { + NOT_IMPLEMENTED; } template <> @@ -656,19 +657,10 @@ void caffe_gpu_abs(const int n, const double* x, double* y) { caffe_gpu_abs_ocl(n, x, y); } - -//template void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, NULL, NULL); -// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL)); } -/* - template void caffe_gpu_memcpy(const size_t N, const long* X, long* Y); - template void caffe_gpu_memcpy(const size_t N, const unsigned long* X, unsigned long* Y); - template void caffe_gpu_memcpy(const size_t N, const int* X, int* Y); - template void caffe_gpu_memcpy(const size_t N, const unsigned int* X, unsigned int* Y); - */ template <> void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { OCL_CHECK( @@ -766,7 +758,6 @@ template <> void caffe_gpu_dot(const int n, const double* x, const double* y, double * out) { //need to pass in scratchBuff - //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n * sizeof(double)), NULL, NULL); cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, @@ -796,7 +787,6 @@ void caffe_gpu_dot(const int n, const float* x, 
size_t offx, const float* template <> void caffe_gpu_dot(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) { //need to pass in scratchBuff - //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n * sizeof(double)), NULL, NULL); cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, @@ -896,6 +886,7 @@ void caffe_gpu_scale(const int n, const double alpha, const double *x, template void set_kernel(const int n, const Dtype alpha, Dtype* y) { + NOT_IMPLEMENTED; } template <> @@ -1003,21 +994,23 @@ void caffe_gpu_powx(const int N, const double* a, const double alpha, } void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { + NOT_IMPLEMENTED; } void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) { + NOT_IMPLEMENTED; } template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, const float* y) { - return 0; + NOT_IMPLEMENTED; } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, const double* y) { - return 0; + NOT_IMPLEMENTED; } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { From efd5dba821070af28951b90293792d528b590f70 Mon Sep 17 00:00:00 2001 From: Junli Gu Date: Thu, 17 Sep 2015 14:57:01 -0700 Subject: [PATCH 115/124] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a1bf49d6..ebc83a1a 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,27 @@ -#OpenCL caffe +#OpenCL Caffe -This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. 
+This is an OpenCL implementation of Caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete Caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. #Design features - -All caffe layers ported to OpenCL + -All Caffe layers ported to OpenCL -Performance improvement by batched implementation for conv layer based on clBLAS - -User can choose optimal batch number depending on H/W, image size and minibatch size + -The user can choose the optimal batch number depending on H/W properties, image size and minibatch size -Supports OpenCL 2.0, 1.2 - -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users + -Implemented in C++ and OpenCL, maintaining the same interfaces as the original Caffe -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19 -Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future. +Note: More features are planned in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered for future addition. #Performance -We will keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved. +We intend to keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved. 
* Training speed (Model: AlexNet, minibatch size 128) @@ -48,7 +48,7 @@ We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical sug As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. #License -Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license. +The original Caffe is provided under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE), an open source license. The OpenCL ports written by AMD are covered by the AMD license. We encourage contributions and support from external developers; your contribution will be covered either by the BSD 2-Clause license or by whichever license you prefer.
# Original Caffe information ## Caffe From 660df235de85a15e1cd4482e35706334b3e6016a Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 18 Sep 2015 15:27:25 +0800 Subject: [PATCH 116/124] comment on where the code is modified for OpenCL port --- src/caffe/layers/absval_layer.cpp | 3 ++- src/caffe/layers/bnll_layer.cpp | 3 ++- src/caffe/layers/concat_layer.cpp | 3 ++- src/caffe/layers/contrastive_loss_layer.cpp | 3 ++- src/caffe/layers/conv_layer.cpp | 4 ++-- src/caffe/layers/deconv_layer.cpp | 2 ++ src/caffe/layers/dropout_layer.cpp | 3 ++- src/caffe/layers/eltwise_layer.cpp | 3 ++- src/caffe/layers/euclidean_loss_layer.cpp | 3 ++- src/caffe/layers/exp_layer.cpp | 3 ++- src/caffe/layers/filter_layer.cpp | 3 ++- src/caffe/layers/hdf5_data_layer.cpp | 3 ++- src/caffe/layers/hdf5_output_layer.cpp | 3 ++- src/caffe/layers/im2col_layer.cpp | 2 ++ src/caffe/layers/inner_product_layer.cpp | 2 ++ src/caffe/layers/log_layer.cpp | 2 ++ src/caffe/layers/lrn_layer.cpp | 2 ++ src/caffe/layers/mvn_layer.cpp | 2 ++ src/caffe/layers/pooling_layer.cpp | 5 ++--- src/caffe/layers/power_layer.cpp | 4 ++-- src/caffe/layers/prelu_layer.cpp | 2 ++ src/caffe/layers/reduction_layer.cpp | 2 ++ src/caffe/layers/relu_layer.cpp | 2 ++ src/caffe/layers/sigmoid_layer.cpp | 2 ++ src/caffe/layers/silence_layer.cpp | 2 ++ src/caffe/layers/slice_layer.cpp | 3 ++- src/caffe/layers/softmax_layer.cpp | 4 ++-- src/caffe/layers/softmax_loss_layer.cpp | 4 ++-- src/caffe/layers/split_layer.cpp | 4 ++-- src/caffe/layers/tanh_layer.cpp | 2 ++ src/caffe/layers/threshold_layer.cpp | 2 ++ 31 files changed, 62 insertions(+), 25 deletions(-) diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 6e06b558..20898f15 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -36,7 +36,7 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void 
AbsValLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -57,6 +57,7 @@ void AbsValLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(AbsValLayer); diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index ed9cc1d4..68a19265 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -39,7 +39,7 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void BNLLLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -62,6 +62,7 @@ void BNLLLayer::Backward_gpu(const vector*>& top, BNLLBackward(count, top_diff, bottom_data, bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(BNLLLayer); diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 5cceb9ff..5def30d4 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -95,7 +95,7 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void ConcatLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -139,6 +139,7 @@ void ConcatLayer::Backward_gpu(const vector*>& top, offset_concat_axis += bottom_concat_axis; } } +// end: code modified for OpenCL port #else STUB_GPU(ConcatLayer); diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 6dda7d61..3410b927 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -100,7 +100,7 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void 
ContrastiveLossLayer::Forward_gpu( @@ -156,6 +156,7 @@ void ContrastiveLossLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(ContrastiveLossLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index b64eb1aa..30da288a 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -69,7 +69,7 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, @@ -227,7 +227,7 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, } } -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(ConvolutionLayer); diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 5b0eeb03..ddf906b7 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -70,6 +70,7 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, @@ -124,6 +125,7 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(DeconvolutionLayer); diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 905ed6ec..21699414 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -67,7 +67,7 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code is written/modified by AMD +// begin: code modified for OpenCL port template void DropoutLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -105,6 +105,7 @@ void DropoutLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(DropoutLayer); diff 
--git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 971703f4..84cc279c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -154,7 +154,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void EltwiseLayer::Forward_gpu(const vector*>& bottom, @@ -241,6 +241,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(EltwiseLayer); diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 2130c6f4..ea78484b 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -44,7 +44,7 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -72,6 +72,7 @@ void EuclideanLossLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(EuclideanLossLayer); diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index 3fe7cde4..ad40bb1b 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -61,7 +61,7 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void ExpLayer::Forward_gpu(const vector*>& bottom, @@ -95,6 +95,7 @@ void ExpLayer::Backward_gpu(const vector*>& top, caffe_gpu_scal(count, inner_scale_, bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(ExpLayer); diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index 2cd9957d..884764b4 100644 --- a/src/caffe/layers/filter_layer.cpp +++ 
b/src/caffe/layers/filter_layer.cpp @@ -117,7 +117,7 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void FilterLayer::Forward_gpu(const vector*>& bottom, @@ -178,6 +178,7 @@ void FilterLayer::Backward_gpu(const vector*>& top, } } +// end: code modified for OpenCL port #else STUB_GPU(FilterLayer); #endif diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 28eee444..c87304b0 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -158,7 +158,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, @@ -197,6 +197,7 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, } } } +// end: code modified for OpenCL port #else STUB_GPU_FORWARD(HDF5DataLayer, Forward); diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 11d01647..0005fb94 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -68,7 +68,7 @@ void HDF5OutputLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -103,6 +103,7 @@ void HDF5OutputLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { return; } +// end: code modified for OpenCL port #else STUB_GPU(HDF5OutputLayer); diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 38e1fd20..36245446 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -88,6 +88,7 @@ void Im2colLayer::Backward_cpu(const vector*>& 
top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void Im2colLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -112,6 +113,7 @@ void Im2colLayer::Backward_gpu(const vector*>& top, bottom[0]->offset(n)); } } +// end: code modified for OpenCL port #else STUB_GPU(Im2colLayer); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index b40e3e7d..cfa4246a 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -120,6 +120,7 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void InnerProductLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -157,6 +158,7 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0); } } +// end: code modified for OpenCL port #else STUB_GPU(InnerProductLayer); diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 5dbbca74..a01c9c18 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -80,6 +80,7 @@ void LogLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void LogLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -126,6 +127,7 @@ void LogLayer::Backward_gpu(const vector*>& top, } caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } +// end: code modified for OpenCL port #else STUB_GPU(LogLayer); diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index da3d1fc3..0c91435b 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -252,6 +252,7 @@ void LRNLayer::WithinChannelBackward(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void 
LRNLayer::CrossChannelForward_gpu( const vector*>& bottom, const vector*>& top) { @@ -310,6 +311,7 @@ void LRNLayer::Backward_gpu(const vector*>& top, LOG(FATAL) << "Unknown normalization region."; } } +// end: code modified for OpenCL port #else STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 2c4acb14..d64f5670 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -123,6 +123,7 @@ void MVNLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void MVNLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -228,6 +229,7 @@ void MVNLayer::Backward_gpu(const vector*>& top, caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(MVNLayer); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 0becf164..812ffbb3 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -314,7 +314,7 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void PoolingLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -408,8 +408,7 @@ void PoolingLayer::Backward_gpu(const vector*>& top, LOG(FATAL) << "Unknown pooling method."; } } - -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(PoolingLayer); #endif diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index a0f5ccee..c3cb1759 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -95,8 +95,8 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void 
PowerLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -169,7 +169,7 @@ void PowerLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } } -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(PowerLayer); #endif diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 75aa3968..55f2e303 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -129,6 +129,7 @@ void PReLULayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void PReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -197,6 +198,7 @@ void PReLULayer::Backward_gpu(const vector*>& top, slope_data, div_factor); } } +// end: code modified for OpenCL port #else STUB_GPU(PReLULayer); diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 0358d83a..ace74b28 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -125,6 +125,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void ReductionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -212,6 +213,7 @@ void ReductionLayer::Backward_gpu(const vector*>& top, ++top_diff; } } +// end: code modified for OpenCL port #else STUB_GPU(ReductionLayer); diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 132d7b4b..3d2eaf2e 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -35,6 +35,7 @@ void ReLULayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void ReLULayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -57,6 +58,7 @@ void ReLULayer::Backward_gpu(const vector*>& top, ReLUBackward(count, top_diff, bottom_data,
bottom_diff, negative_slope); } } +// end: code modified for OpenCL port #else STUB_GPU(ReLULayer); diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 737bff74..b820e8ff 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -40,6 +40,7 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void SigmoidLayer::Forward_gpu(const vector*>& bottom, @@ -63,6 +64,7 @@ void SigmoidLayer::Backward_gpu(const vector*>& top, SigmoidBackward(count, top_diff, top_data, bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(SigmoidLayer); diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index a6c30fbb..4436584b 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -17,6 +17,7 @@ void SilenceLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void SilenceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -33,6 +34,7 @@ void SilenceLayer::Backward_gpu(const vector*>& top, } } } +// end: code modified for OpenCL port #else STUB_GPU(SilenceLayer); diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index cd19fdb5..de21e936 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -112,6 +112,7 @@ void SliceLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void SliceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -151,7 +152,7 @@ void SliceLayer::Backward_gpu(const vector*>& top, offset_slice_axis += top_slice_axis; } } - +// end: code modified for OpenCL port #else STUB_GPU(SliceLayer); diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 366946bd..1269b058 100644 --- 
a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -91,7 +91,7 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void SoftmaxLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -148,7 +148,7 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); } -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(SoftmaxLayer); #endif diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 2241bd6c..ef03ec7e 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -124,7 +124,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port #ifndef CPU_ONLY template void SoftmaxWithLossLayer::Forward_gpu( @@ -191,7 +191,7 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, } } } -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(SoftmaxWithLossLayer); #endif diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 57677b5b..e92f7bf2 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -60,7 +60,7 @@ void SplitLayer::Forward_gpu(const vector*>& bottom, } } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void SplitLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { @@ -80,7 +80,7 @@ void SplitLayer::Backward_gpu(const vector*>& top, caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); } } -// end: code written/modified by AMD +// end: code modified for OpenCL port #else STUB_GPU(SplitLayer); #endif diff --git
a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 7a15809d..f62092b2 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -38,6 +38,7 @@ void TanHLayer::Backward_cpu(const vector*>& top, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void TanHLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -60,6 +61,7 @@ void TanHLayer::Backward_gpu(const vector*>& top, TanHBackward(count, top_diff, top_data, bottom_diff); } } +// end: code modified for OpenCL port #else STUB_GPU(TanHLayer); diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index a4c543ee..eebc379a 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -25,6 +25,7 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } #ifndef CPU_ONLY +// begin: code modified for OpenCL port template void ThresholdLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { @@ -34,6 +35,7 @@ void ThresholdLayer::Forward_gpu(const vector*>& bottom, // NOLINT_NEXT_LINE(whitespace/operators) ThresholdForward(count, threshold_, bottom_data, top_data); } +// end: code modified for OpenCL port #else STUB_GPU_FORWARD(ThresholdLayer, Forward); From ab0b360f331ecc0289d7ae06a773fa6ec220200f Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 18 Sep 2015 15:30:10 +0800 Subject: [PATCH 117/124] Go through 1x1 convolution --- include/caffe/common.hpp | 2 +- src/caffe/layers/base_conv_layer.cpp | 18 +++++++++--------- src/caffe/layers/conv_layer.cpp | 2 +- src/caffe/test/test_convolution_layer.cpp | 3 ++- src/caffe/util/math_functions.cpp | 12 ++++++------ 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 7aed6007..0b455c59 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -82,7 +82,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer 
/*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 0 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 04cd38dd..6c66ac12 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -291,15 +291,15 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); } col_buff = col_buffer_.gpu_data(); - } + } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ + caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_ - * g, col_buff, col_offset_ * g, (Dtype) 0., output, top_offset_ + * g, col_buff, is_1x1_ * bottom_offset_ + col_offset_ * g, (Dtype) 0., output, top_offset_ + output_offset_ * g); } + } template @@ -316,13 +316,14 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, if (is_1x1_) { col_buff = input; } + for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm < Dtype - > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + caffe_gpu_gemm < Dtype> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_ - * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, col_offset_ + * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, is_1x1_ * bottom_offset_ + col_offset_ * g); } + if (!is_1x1_) { conv_col2im_gpu(col_buff, input); } @@ -339,8 +340,7 @@ void 
BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ - / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_, (Dtype*) col_buff, col_offset_ - * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g); + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_ + output_offset_*g, (Dtype*) col_buff, is_1x1_*bottom_offset_ + col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g); } } diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index b64eb1aa..ed9950c4 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -124,7 +124,7 @@ void ConvolutionLayer::Forward_gpu_opt2( template void ConvolutionLayer::Forward_gpu_org( const vector*>& bottom, const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); //CHECK_BLOB_DATA(bottom[i],10,"bottom"); diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 67d41fff..576095c1 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -122,10 +122,11 @@ class ConvolutionLayerTest : public MultiDeviceTest { } virtual ~ConvolutionLayerTest() { - delete blob_bottom_; + /* delete blob_bottom_; delete blob_bottom_2_; delete blob_top_; delete blob_top_2_; + */ } virtual Blob* MakeReferenceTop(Blob* top) { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index aebeb5ed..4f7e5c03 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -483,8 +483,8 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, int ldb = (TransB == CblasNoTrans) ? 
N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, beta, (cl_mem) C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); } @@ -523,8 +523,8 @@ cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, int ldb = (TransB == CblasNoTrans) ? N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event)); return event; @@ -566,8 +566,8 @@ cl_event caffe_gpu_gemm(cl_command_queue *queue, int ldb = (TransB == CblasNoTrans) ? N : K; int ldc = N; CLBLAS_CHECK( - clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, - (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); return event; } From 3acadc06d0966bfe1329787f39443b82ef3bb658 Mon Sep 17 00:00:00 2001 From: Yibing Date: Fri, 18 Sep 2015 20:53:14 +0800 Subject: [PATCH 118/124] Go through conv_layer --- include/caffe/util/ocl_util.hpp | 3 +-- include/caffe/util/ocl_wrapper.hpp | 2 +- src/caffe/blob.cpp | 4 +-- src/caffe/common.cpp | 1 + src/caffe/layers/base_conv_layer.cpp | 4 +-- src/caffe/layers/conv_layer.cpp | 2 -- src/caffe/ocl/im2col.cl | 8 +++--- src/caffe/syncedmem.cpp | 2 +- src/caffe/util/math_functions.cpp | 40 +++++++++++++++++++--------- src/caffe/util/ocl_util.cpp | 4 ++- src/caffe/util/ocl_wrapper.cpp | 14 ++++++---- 11 files changed, 51 insertions(+), 33 deletions(-) diff --git 
a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp index dcdf1057..3027019f 100644 --- a/include/caffe/util/ocl_util.hpp +++ b/include/caffe/util/ocl_util.hpp @@ -32,8 +32,7 @@ namespace caffe { template void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0); -void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, - const int count); +void ocl_memset(cl_mem buffer, const int value, const int count); void eventCallback(cl_event event, cl_int event_status, void * user_data); #endif diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 61d6162e..308292c8 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -218,7 +218,7 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed = 0); template -void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup); +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int _seed = 0); template void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V); diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 089899fc..ece07d14 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -426,10 +426,10 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { switch (Caffe::mode()) { case Caffe::GPU: if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), + caffe_gpu_copy(count_, source.gpu_diff(), static_cast(diff_->mutable_gpu_data())); } else { - caffe_copy(count_, source.gpu_data(), + caffe_gpu_copy(count_, source.gpu_data(), static_cast(data_->mutable_gpu_data())); } break; diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index f42a4204..9ed4207a 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -105,6 +105,7 @@ void Caffe::set_random_seed(const unsigned int seed) { // RNG seed 
Get().random_generator_.reset(new RNG(seed)); caffe_gpu_uniform(0, NULL, seed); + caffe_gpu_uniform((float*)NULL, 0, (float)0.0, (float)1.0, seed); } void Caffe::SetDevice(const int device_id) { diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 6c66ac12..a233e6c9 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -224,8 +224,7 @@ void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, col_buff = col_buffer_.cpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm < Dtype - > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + caffe_cpu_gemm (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g, (Dtype) 0., output + output_offset_ * g); } @@ -292,7 +291,6 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } col_buff = col_buffer_.gpu_data(); } - for (int g = 0; g < group_; ++g) { caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_ diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index c720bbb9..27777122 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -127,8 +127,6 @@ void ConvolutionLayer::Forward_gpu_org( const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); - //CHECK_BLOB_DATA(bottom[i],10,"bottom"); - Dtype* top_data = top[i]->mutable_gpu_data(); for (int n = 0; n < this->num_; ++n) { //two intermediate variables to pass offset diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index d03463ee..2d9032db 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -94,11 
+94,11 @@ __kernel void im2col(const int n, __global const T* data_im, const int img_offse } } -template __attribute__((mangled_name(im2col_float))) void im2col_gpu_kernel(const int n, __global const float* data_im, +template __attribute__((mangled_name(im2col_float))) void im2col(const int n, __global const float* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset); -template __attribute__((mangled_name(im2col_double))) void im2col_gpu_kernel(const int n, __global const double* data_im, +template __attribute__((mangled_name(im2col_double))) void im2col(const int n, __global const double* data_im, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset); @@ -138,12 +138,12 @@ __kernel void col2im(const int n, __global const T* data_col, const int col_offs } } -template __attribute__((mangled_name(col2im_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset, +template __attribute__((mangled_name(col2im_float))) __kernel void col2im(const int n, __global const float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w,const int pad_h, const int pad_w, const int stride_h, const int stride_w,const int height_col, const int width_col, __global float* data_im, const int img_offset); -template __attribute__((mangled_name(col2im_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col, +template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const int n, __global const double* 
data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index a3fa9973..76d3f2ea 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -115,7 +115,7 @@ inline void SyncedMemory::to_gpu() { fprintf(stderr, "Failed to create memory object\n"); break; } - ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int))); + ocl_memset(tmpMem, (int) 0, (int) (size_ / sizeof(int))); gpu_ptr_ = (void*) tmpMem; head_ = HEAD_AT_GPU; break; diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index cfd7709e..155aac45 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -110,6 +110,16 @@ void caffe_set(const int N, const double alpha, double* Y) { } } +template <> +void caffe_copy(const int N, const float* X, float* Y) { + cblas_scopy(N, X, 1, Y, 1); +} + +template <> +void caffe_copy(const int N, const double* X, double* Y) { + cblas_dcopy(N, X, 1, Y, 1); +} + template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { @@ -124,16 +134,6 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { } } -template <> -void caffe_copy(const int N, const float* X, float* Y) { - cblas_scopy(N, X, 1, Y, 1); -} - -template <> -void caffe_copy(const int N, const double* X, double* Y) { - cblas_dcopy(N, X, 1, Y, 1); -} - template <> void caffe_scal(const int N, const float alpha, float *X) { cblas_sscal(N, alpha, X, 1); @@ -226,13 +226,14 @@ void caffe_log(const int n, const double* a, double* y) { vdLn(n, a, y); } + template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (X != Y) { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY // 
NOLINT_NEXT_LINE(caffe/alt_fn) - //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); + // caffe_gpu_copy(N, X, Y); #else NO_GPU; #endif @@ -242,6 +243,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) { } } + template void caffe_copy(const int N, const int* X, int* Y); template void caffe_copy(const int N, const unsigned int* X, unsigned int* Y); @@ -675,6 +677,20 @@ void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { N, 0, NULL, NULL)); } +template +void caffe_gpu_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N * sizeof(Dtype), 0, NULL, NULL)); + } +} +template void caffe_gpu_copy(const int N, const float* X, float* Y); +template void caffe_gpu_copy(const int N, const double* X, double* Y); +template void caffe_gpu_copy(const int N, const int* X, int* Y); +template void caffe_gpu_copy(const int N, const unsigned int* X, unsigned int* Y); + +/* template <> void caffe_gpu_copy(const int N, const float* X, float* Y) { if (X != Y) { @@ -692,7 +708,7 @@ void caffe_gpu_copy(const int N, const double* X, double* Y) { &(amdDevice.CommandQueue), 0, NULL, NULL)); } } - +*/ template <> void caffe_gpu_copy(const int N, const float* X, const int offx, float* Y, const int offy) { if (X != Y) { diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp index 0b151e5a..bc2aea35 100644 --- a/src/caffe/util/ocl_util.cpp +++ b/src/caffe/util/ocl_util.cpp @@ -60,8 +60,10 @@ template void ocl_memset(int* buffer, const int value, const int count, con template void ocl_memset(float* buffer, const float value, const int count, const int buf_offset); template void ocl_memset(double* buffer, const double value, const int count, const int buf_offset); -void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, +void ocl_memset(cl_mem buffer, const int value, const int count) { + std::string kernel_name = 
std::string("OCL_memset2"); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); cl_int err; err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 7ffadc72..5bb4f2fa 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -120,12 +120,16 @@ template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data); template -void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int seed_) { - std::string kernel_name = "RNGUniform" + get_dtype_suffix(); + static unsigned c = 0; + if ((n == 0) || (a == NULL)) { + c = seed_; + return; + } + std::string kernel_name = "RNGUniform" + get_dtype_suffix(); cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); - static unsigned c = 0; unsigned nrounds = 20; array4x32 rndctr4; rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; @@ -144,8 +148,8 @@ void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) size_t localws[1] = {256}; OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); } -template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup); -template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup); +template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup, unsigned int seed_); +template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup, unsigned int seed_); void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed) { From 606117dde3ebe9a2b402a195e2d6201b6b133aba Mon Sep 17 00:00:00 2001 From: Yibing Date: Sat, 19 Sep 2015 00:08:22 +0800 Subject: [PATCH 
119/124] Go through GPUMathFunctionsTest --- include/caffe/common.hpp | 2 +- include/caffe/util/ocl_wrapper.hpp | 3 ++ include/caffe/vision_layers.hpp | 4 +-- src/caffe/layers/conv_layer.cpp | 8 ++--- src/caffe/ocl/util.cl | 11 +++++++ src/caffe/solver.cpp | 2 +- src/caffe/test/test_math_functions.cpp | 2 +- src/caffe/util/math_functions.cpp | 41 ++++---------------------- src/caffe/util/ocl_wrapper.cpp | 18 +++++++++++ 9 files changed, 47 insertions(+), 44 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 0b455c59..7aed6007 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -82,7 +82,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 0 +#define use_packing_scheme 1 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp index 308292c8..0ce3a184 100644 --- a/include/caffe/util/ocl_wrapper.hpp +++ b/include/caffe/util/ocl_wrapper.hpp @@ -226,6 +226,9 @@ void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V); template void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y); +template +void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y); + template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y); diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index e2a9b190..c3a73014 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -237,9 +237,9 @@ class ConvolutionLayer: public BaseConvolutionLayer { const vector*>& top); virtual void Backward_gpu_org(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void 
Forward_gpu_opt2(const vector*>& bottom, + virtual void Forward_gpu_batched(const vector*>& bottom, const vector*>& top); - virtual void Backward_gpu_opt2(const vector*>& top, + virtual void Backward_gpu_batched(const vector*>& top, const vector& propagate_down, const vector*>& bottom); #endif }; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 27777122..4f0175e0 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -75,7 +75,7 @@ template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { if (use_packing_scheme && global_packing_N > 1) - Forward_gpu_opt2(bottom, top); + Forward_gpu_batched(bottom, top); else Forward_gpu_org(bottom, top); } @@ -84,13 +84,13 @@ template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (use_packing_scheme && global_packing_N > 1) - Backward_gpu_opt2(top, propagate_down, bottom); + Backward_gpu_batched(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); } template -void ConvolutionLayer::Forward_gpu_opt2( +void ConvolutionLayer::Forward_gpu_batched( const vector*>& bottom, const vector*>& top) { const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { @@ -146,7 +146,7 @@ void ConvolutionLayer::Forward_gpu_org( } template -void ConvolutionLayer::Backward_gpu_opt2(const vector*>& top, +void ConvolutionLayer::Backward_gpu_batched(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl index eced284b..222e4ed9 100644 --- a/src/caffe/ocl/util.cl +++ b/src/caffe/ocl/util.cl @@ -57,6 +57,17 @@ __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { template __attribute__((mangled_name(caffe_gpu_sign_float))) 
__kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y); +template +__kernel void caffe_gpu_sgnbit(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =(X[gdx] < 0.0); + } +} + +template __attribute__((mangled_name(caffe_gpu_sgnbit_float))) __kernel void caffe_gpu_sgnbit(const int N, __global float* X, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_sgnbit_double))) __kernel void caffe_gpu_sgnbit(const int N, __global double* X, __global double* Y); + template __kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx, __global T* Y, const int offy) { X += offx; diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 20af4160..b9ed1050 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -677,7 +677,7 @@ void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { case Caffe::GPU: { #ifndef CPU_ONLY // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), + caffe_gpu_copy(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp index a095b544..a0f88065 100644 --- a/src/caffe/test/test_math_functions.cpp +++ b/src/caffe/test/test_math_functions.cpp @@ -232,7 +232,7 @@ TYPED_TEST(GPUMathFunctionsTest, TestCopy) { const int n = this->blob_bottom_->count(); const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - caffe_copy(n, bottom_data, top_data); + caffe_gpu_copy(n, bottom_data, top_data); bottom_data = this->blob_bottom_->cpu_data(); top_data = this->blob_top_->mutable_cpu_data(); for (int i = 0; i < n; ++i) { diff --git 
a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 155aac45..e45fd564 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -110,6 +110,7 @@ void caffe_set(const int N, const double alpha, double* Y) { } } +/* template <> void caffe_copy(const int N, const float* X, float* Y) { cblas_scopy(N, X, 1, Y, 1); @@ -119,7 +120,7 @@ template <> void caffe_copy(const int N, const double* X, double* Y) { cblas_dcopy(N, X, 1, Y, 1); } - +*/ template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { @@ -208,7 +209,7 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) { return; } for (int i = 0; i < N; ++i) { - Y[i] = alpha; + Y[i] = alpha; } } @@ -226,24 +227,13 @@ void caffe_log(const int n, const double* a, double* y) { vdLn(n, a, y); } - template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - // caffe_gpu_copy(N, X, Y); -#else - NO_GPU; -#endif - } else { memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } } } - template void caffe_copy(const int N, const int* X, int* Y); template void caffe_copy(const int N, const unsigned int* X, unsigned int* Y); @@ -387,7 +377,7 @@ void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { template void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); -// + template <> float caffe_cpu_dot(const int n, const float* x, const float* y) { return cblas_sdot(n, x, 1, y, 1); @@ -641,12 +631,12 @@ void caffe_gpu_axpy(const int N, const double alpha, const double* X, template <> void caffe_gpu_sgnbit(const int n, const float* x, float* y) { - NOT_IMPLEMENTED; + caffe_gpu_signbit(n, x, y); } template <> void caffe_gpu_sgnbit(const int n, const double* x, double* y) { - NOT_IMPLEMENTED; + 
caffe_gpu_signbit(n, x, y); } template <> @@ -690,25 +680,6 @@ template void caffe_gpu_copy(const int N, const double* X, double* Y); template void caffe_gpu_copy(const int N, const int* X, int* Y); template void caffe_gpu_copy(const int N, const unsigned int* X, unsigned int* Y); -/* -template <> -void caffe_gpu_copy(const int N, const float* X, float* Y) { - if (X != Y) { - CLBLAS_CHECK( - clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } -} - -template <> -void caffe_gpu_copy(const int N, const double* X, double* Y) { - if (X != Y) { - CLBLAS_CHECK( - clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, - &(amdDevice.CommandQueue), 0, NULL, NULL)); - } -} -*/ template <> void caffe_gpu_copy(const int N, const float* X, const int offx, float* Y, const int offy) { if (X != Y) { diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp index 5bb4f2fa..0b4cbf6f 100644 --- a/src/caffe/util/ocl_wrapper.cpp +++ b/src/caffe/util/ocl_wrapper.cpp @@ -1568,6 +1568,24 @@ template void caffe_gpu_add(const int n, const float* in1, template void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); +template +void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sgnbit" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void caffe_gpu_signbit(const int N, const float* X, float * Y); +template void caffe_gpu_signbit(const int N, const double* X, double * Y); + 
template void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); From ecbd837091ef06a6f0ab2a1a2e1b1308fe965e83 Mon Sep 17 00:00:00 2001 From: Junli Date: Sat, 19 Sep 2015 14:00:51 +0800 Subject: [PATCH 120/124] fixed im2col_opt paramters --- include/caffe/util/im2col.hpp | 8 ++-- include/caffe/vision_layers.hpp | 6 ++- src/caffe/ocl/im2col.cl | 46 ++++++++++++---------- src/caffe/util/im2col.cpp | 69 +++++++++++++++++++-------------- 4 files changed, 72 insertions(+), 57 deletions(-) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 327d7415..9c6de363 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -54,8 +54,8 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, int optnum); template @@ -65,8 +65,8 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, int optnum); #endif } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index c3a73014..381b983b 100644 --- 
a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -117,12 +117,14 @@ class BaseConvolutionLayer: public Layer { protected: inline void conv_im2col_gpu_opt(const Dtype* data) { im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, - conv_in_width_, kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, (Dtype*) transMem, 0, opt_num2); } inline void conv_col2im_gpu_opt(Dtype* data) { col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, - conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_, opt_num2); } private: diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 2d9032db..26152470 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -24,7 +24,7 @@ * POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ template -__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { +__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { int index = get_global_id(0); @@ -34,20 +34,20 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c int x_out = index % width_col; int y_out = (index / width_col) % height_col; int channel_in = (index / width_col / height_col) % channels; - 
int channel_out = channel_in * ksize * ksize; + int channel_out = channel_in * kernel_h * kernel_w; int im_id = index / width_col / height_col / channels; - int y_in = y_out * stride - pad; - int x_in = x_out * stride - pad; + int y_in = y_out * stride_h - pad_h; + int x_in = x_out * stride_w - pad_w; int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; int offset_im = im_id * channels * height * width + channel_in * height * width; - for(int k_h = 0; k_h < ksize; k_h++) { - for(int k_w = 0; k_w < ksize; k_w++) { + for(int k_h = 0; k_h < kernel_h; k_h++) { + for(int k_w = 0; k_w < kernel_w; k_w++) { int x_im = x_in + k_w; int y_im = y_in + k_h; int index_im = y_im * width + x_im; - int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + int index_col = (k_h * kernel_h + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) data_col[offset_col + index_col] = data_im[offset_im + index_im]; else @@ -56,8 +56,8 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c } } -template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); -template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global 
float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int tride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); template __kernel void im2col(const int n, __global const T* data_im, const int img_offset, @@ -149,25 +149,27 @@ template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); template -__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { +__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, +const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { int index = get_global_id(0); data_col = data_col + col_offset; data_im = data_im + img_offset; if(index < n) { T val = 0; - int w = index % width + pad; - int h = (index / width) % height + pad; + int w = index % width + pad_w; + int h = (index / width) % height + 
pad_h; int c = index / (width * height) % channels; int im = index / width / height / channels; // compute the start and end of the output - int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1; - int w_col_end = min(w / stride + 1, width_col); - int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1; - int h_col_end = min(h / stride + 1, height_col); + int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); // equivalent implementation - int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col; - int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col; - int coeff_w_col = (1 - stride * height_col * width_col * optnum); + int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride_h * kernel_h * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col * optnum); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; @@ -176,8 +178,10 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset data_im[index] = val; } } -template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); -template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int 
col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); template __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) { diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 6899d15a..241062a1 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -106,13 +106,13 @@ template void col2im_cpu(const double* data_col, const int channels, #ifndef CPU_ONLY template void col2im_gpu_opt(const Dtype* data_col, const int col_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, int optnum) { 
std::string kernel_name = "col2im_opt" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int num_kernels = channels * height * width; cl_int ret; @@ -122,14 +122,17 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + ret |= 
clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); OCL_CHECK(ret); size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; @@ -140,12 +143,14 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, } template void col2im_gpu_opt(const float* data_col, const int col_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, float* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im, const int img_offset, int optnum); template void col2im_gpu_opt(const double* data_col, const int col_offset, const int channels, const int height, const int width, - const int ksize, const int pad, const int stride, double* data_im, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im, const int img_offset, int optnum); template @@ -243,15 +248,15 @@ template void col2im_gpu(const double* data_col, const int col_offset, template void im2col_gpu_opt(const Dtype* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int stride, Dtype* data_col, const int col_offset, + const int channels, const int height, const int width,const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, int optnum) { std::string kernel_name = "im2col_opt" + get_dtype_suffix(); cl_kernel Kernel = amdDevice.GetKernel(kernel_name); - int height_col = (height + 2 * pad - ksize) / stride + 1; - int width_col = (width + 2 * pad - ksize) / stride + 1; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int num_kernels = optnum * channels * height_col 
* width_col; cl_int ret; @@ -261,14 +266,17 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset, ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); - ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize); - ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad); - ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride); - ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col); - ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col); - ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col); - ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset); - ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); OCL_CHECK(ret); size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; @@ -279,12 +287,13 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset, } template void im2col_gpu_opt(const float* data_im, const int img_offset, - const int channels, const int height, const int width, const int ksize, - const int pad, const int 
stride, float* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col, const int col_offset, int optnum); template void im2col_gpu_opt(const double* data_im, const int img_offset, const int channels, const int height, const int width, - const int ksize, const int pad, const int stride, double* data_col, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col, const int col_offset, int optnum); #endif From 7ac0a963a9f8e48af895be4e684026d33d1be9f7 Mon Sep 17 00:00:00 2001 From: Junli Date: Sat, 19 Sep 2015 14:55:44 +0800 Subject: [PATCH 121/124] fixed im2col --- src/caffe/ocl/im2col.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl index 26152470..f1a97eab 100644 --- a/src/caffe/ocl/im2col.cl +++ b/src/caffe/ocl/im2col.cl @@ -47,7 +47,7 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c int x_im = x_in + k_w; int y_im = y_in + k_h; int index_im = y_im * width + x_im; - int index_col = (k_h * kernel_h + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + int index_col = (k_h * kernel_w + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) data_col[offset_col + index_col] = data_im[offset_im + index_im]; else @@ -168,7 +168,7 @@ const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, int h_col_end = min(h / stride_h + 1, height_col); // equivalent implementation int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col; - int coeff_h_col = (1 - stride_h * kernel_h * height_col * optnum) * width_col; + int coeff_h_col = (1 - stride_h * kernel_w * 
height_col * optnum) * width_col; int coeff_w_col = (1 - stride_w * height_col * width_col * optnum); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { From a894a29847b4fe2282d8e817b682cf206e869e5e Mon Sep 17 00:00:00 2001 From: Junli Date: Sun, 20 Sep 2015 01:14:36 +0800 Subject: [PATCH 122/124] direct is_1_1 conv to original scheme --- src/caffe/layers/base_conv_layer.cpp | 29 ++++------------------------ src/caffe/layers/conv_layer.cpp | 4 ++-- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index a233e6c9..e8241555 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -349,21 +349,16 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1); } -// begin: code written/modified by AMD +// begin: code modified for OpenCL port template void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { cl_command_queue Queue; const Dtype* col_buff = input; - if (!is_1x1_) { if (!skip_im2col) { conv_im2col_gpu_opt(input); } col_buff = col_buffer_.gpu_data(); - } else { - caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, - (Dtype*) transMem); - } #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; @@ -402,11 +397,6 @@ template void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, const Dtype* weights, Dtype* input) { cl_command_queue Queue; - if (is_1x1_) { - caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); - } for (int g = 0; g < group_; ++g) { #ifdef multiQ if(g == 0) Queue = amdDevice.CommandQueue; @@ -426,26 
+416,15 @@ void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, } #endif - if (!is_1x1_) { conv_col2im_gpu_opt(input); - } else { - caffe_gpu_memcpy( - height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), - (Dtype*) transMem, input); - } } template void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, const Dtype* output, Dtype* weights) { cl_command_queue Queue; - if (!is_1x1_) { - conv_im2col_gpu_opt(input); - } else { - caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, - (Dtype*) transMem); - } - opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + conv_im2col_gpu_opt(input); + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, opt_num2); for (int g = 0; g < group_; ++g) { @@ -468,7 +447,7 @@ void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, } } -// end: code is written/modified by AMD +// end: code is modified for OpenCL #endif // !CPU_ONLY INSTANTIATE_CLASS (BaseConvolutionLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 4f0175e0..99897e67 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -74,7 +74,7 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - if (use_packing_scheme && global_packing_N > 1) + if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) Forward_gpu_batched(bottom, top); else Forward_gpu_org(bottom, top); @@ -83,7 +83,7 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (use_packing_scheme && global_packing_N > 1) + if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) Backward_gpu_batched(top, propagate_down, bottom); else Backward_gpu_org(top, propagate_down, bottom); From 
1511d4e6da461f8b05c24530785842863f9650ec Mon Sep 17 00:00:00 2001 From: Yibing Date: Sun, 20 Sep 2015 01:31:14 +0800 Subject: [PATCH 123/124] Removed unused variable in base_conv_layer --- Makefile.config | 7 +++++++ src/caffe/layers/base_conv_layer.cpp | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile.config b/Makefile.config index 829e2732..eea4c1f3 100644 --- a/Makefile.config +++ b/Makefile.config @@ -1,6 +1,13 @@ ## Refer to http://caffe.berkeleyvision.org/installation.html # Contributions simplifying and improving our build system are welcome! +# Use OpenCL + USE_OPENCL := 1 +# OpenCL directory + OCL_DIR := /opt/AMDAPPSDK-2.9-1 +# clBLAS directory + CLBLAS_DIR := /opt/clBLAS-2.1 + # cuDNN acceleration switch (uncomment to build with cuDNN). # USE_CUDNN := 1 diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index e8241555..5d99e04d 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -353,12 +353,10 @@ void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, template void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, const Dtype* weight, Dtype* output, bool skip_im2col) { - cl_command_queue Queue; - const Dtype* col_buff = input; + cl_command_queue Queue; if (!skip_im2col) { conv_im2col_gpu_opt(input); } - col_buff = col_buffer_.gpu_data(); #ifdef multiQ for (int g = 0; g < group_; ++g) { if(g == 0) Queue = amdDevice.CommandQueue; From 3318335404761a1e381f6daa64f0719e3e88e88c Mon Sep 17 00:00:00 2001 From: Junli Date: Sun, 20 Sep 2015 14:16:35 +0800 Subject: [PATCH 124/124] pass col2im_opt unit test --- include/caffe/common.hpp | 2 +- .../test/.test_gradient_check_util.hpp.swo | Bin 0 -> 16384 bytes src/caffe/layers/conv_layer.cpp | 16 ++++++++-------- src/caffe/util/im2col.cpp | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 include/caffe/test/.test_gradient_check_util.hpp.swo diff --git 
a/include/caffe/common.hpp b/include/caffe/common.hpp index 7aed6007..0b455c59 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -82,7 +82,7 @@ private:\ //the following are macro defines for optimization schmes in conv layer /*ifdef: use proposed img_packing scheme; ifndef: use proposed packing im2col + sgemm scheme*/ -#define use_packing_scheme 1 +#define use_packing_scheme 0 /* global_packing_N defines packing number of the use_packing scheme for intial design, we use the same packing number for all conv layers*/ #define global_packing_N 16 diff --git a/include/caffe/test/.test_gradient_check_util.hpp.swo b/include/caffe/test/.test_gradient_check_util.hpp.swo new file mode 100644 index 0000000000000000000000000000000000000000..e3ebfc99f1458d91db2d1bab13d6b930f447c17f GIT binary patch literal 16384 zcmeHNO^h5z6)ubeBw&8R0VIA@_6n;P&)){5NbB{WWj$V7tQ~BxBfzm*HQhBcweIO| zbyx3tjFTLYI3kb`mxL=qzzq-*7nB@w3KBmON^TqwmvG?%93b$$s_NO9+4b5;K|*r1 z^v!fvy{dZk)vNbj&s4wj?4`^6>DK2Nt`9Tz-Shvr@z{x<9s9vo8Jko?kx}^ruJz03 zo7Z~Rd+XOWE*>TNg6XgwiNQcN$Fdm8=1!F+%9Tv|?PqeCZfrHJM3tAikYe1_vea#r zMoAS*E88|j^brmV5i6PMFdE6|c35eZv_^S;lpem$Gw^6KaDsjAsg(}Red*Lm{?R9I zJeoB4S$hUN1D*lTfM>un;2H1?cm^I(26TR$y@ zeqW{+as2;ZzyJT?LyY|d_%ZMu;0~|{^nt6u7l1CX1Uvye4*dOtjQtJx5%3K_0at-d z;GZWL`z7!Ka1MC;1B|^1+ynZ+bHGP{W5Cbf&sYJR0{-{}W4{9K0vhN7e|?;>-vYk@ zUId!Jr-8TL$JqCQmw_GNQ^23!%h;>HD?kA(0dKsAv2OvlfEAzvECGLcjIo!1Zvr{+ z6<`zS08O9)JPEvh9PI&0 z>&y<$p`1!dUZgRR4w@}5mrfcJ< zo^s~|EQ9sKnrlA8pR2MgV&yaF1#L8xbz`Cn+~1_<*#Yg+S{1i?L8oC_#6k;*Ic*ke z7b-9_G@G{sr<&8``zP^p{pRKBgj(kZN#r zbcQ8lh;?Z_3Fp2{j3!H1Tzlb1rf2A6}MY_d!*2fed4f`Dv3?6Be5&tuFxlT`2K<%*jrfkfk?1dtu+!O|J`;q zs#3F~*4lc(huEFe(LrN4Y9}&kLf1Pm{(Y^nmuPZmhx-B=9Dzphljr!()Tp9t3=2Z| z#ad@67B`TsVXJ|)stOcYLs{m{CL)i7F$FQ7gouu9y1NtZEMsLlRF^V0z})_D8o_K+ zm>;i78?WfJqw5=1gNT;P0x@P>`_m>(W8WMWmwMI-(5UYKr*^*{T+`m2k*G?oc8_4y zhF1hua?`aJa>if?tmCT*X7i~+!#KI-%G9t;gJ$=-)HiaMW0+=BfupBO!Q;TzuzZ<} z<}kZG)LtHj)OyKFBuOST7#Cn>uXkX57ZGLb{Y{>6(zRaUjkFt<_HZzOQ?7i$kuAobRrRF$i9S#D-p 
zuQQFpXJTfZKv3x_l35q^g8B}r?0$d@rni*%QXmcJ6AvZ`2lQD#*oYi3>X_BVjHV-$pJ0BZc3}V}}!O{r9>O~x5gYd1Vz*jLAiMne&vx(Bvm<+#6gCN4f z@EN1DuFM4@w|T*Wer(B|Fz2UCQ5%t$4w9cbP+_dxR_)iV*}SwnQxc7Z9?cerWSAA$ zH{un;2H1? bcm_NJo&nFm|04rSGR1$s=7;MC|5ETDvG6#O literal 0 HcmV?d00001 diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 99897e67..4bfd4dba 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -74,19 +74,19 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, template void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) - Forward_gpu_batched(bottom, top); - else + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + //Forward_gpu_batched(bottom, top); + //else Forward_gpu_org(bottom, top); } template void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) Backward_gpu_batched(top, propagate_down, bottom); - else - Backward_gpu_org(top, propagate_down, bottom); + //else + //Backward_gpu_org(top, propagate_down, bottom); } template @@ -132,6 +132,7 @@ void ConvolutionLayer::Forward_gpu_org( //two intermediate variables to pass offset this->bottom_offset_ = bottom[i]->offset(n); this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; this->forward_gpu_gemm(bottom_data, weight, top_data); if (this->bias_term_) { @@ -156,7 +157,6 @@ void ConvolutionLayer::Backward_gpu_batched(const vector*>& t // Bias gradient, if necessary. 
if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count()); for (int n = 0; n < this->num_; ++n) { this->top_offset_ = top[i]->offset(n); this->backward_gpu_bias(bias_diff, top_diff); @@ -186,7 +186,6 @@ void ConvolutionLayer::Backward_gpu_batched(const vector*>& t } } } - } template void ConvolutionLayer::Backward_gpu_org(const vector*>& top, @@ -203,6 +202,7 @@ void ConvolutionLayer::Backward_gpu_org(const vector*>& top, // this->top_offset_ = top[i]->offset(n); this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; this->backward_gpu_bias(bias_diff, top_diff); } } diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index 241062a1..e9c07970 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -113,7 +113,7 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, cl_kernel Kernel = amdDevice.GetKernel(kernel_name); int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int num_kernels = channels * height * width; + int num_kernels = channels * height * width * optnum; cl_int ret; ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);